import gradio as gr
import pandas as pd

# Remote location of the banner image displayed at the top of the page.
banner_url = (
    "https://huggingface.co/spaces/elmresearchcenter/"
    "open_universal_arabic_asr_leaderboard/resolve/main/banner.png"
)
# HTML snippet: a flex container holding the banner <img> (10vw wide, capped at 600px).
BANNER = (
    '<div style="display: flex; justify-content: space-around;">'
    '<img src="' + banner_url + '" alt="Banner" style="width: 10vw; max-width: 600px;">'
    " </div>"
)

# Markdown shown below the banner; its four paragraphs are separated by <br>.
INTRODUCTION_TEXT = "<br>".join(
    (
        "📖**Open Universal Arabic ASR Leaderboard**📖 benchmarks multi-dialect Arabic ASR models on various multi-dialect datasets.",
        "Apart from the WER%/CER% for each test set, we also report the Average WER%/CER% and rank the models based on the Average WER, from lowest to highest.",
        "To reproduce the benchmark numbers and request a model that is not listed, you can launch an issue/PR in our [GitHub repo](https://github.com/Natural-Language-Processing-Elm/open_universal_arabic_asr_leaderboard)😊.",
        "For more detailed analysis such as models' robustness, speaker adaption, model efficiency and memory usage, please check our [paper](https://arxiv.org/pdf/2412.13788).",
    )
)

# BibTeX entry offered in the citation box; it starts and ends with a newline.
CITATION_BUTTON_TEXT = (
    "\n"
    "@article{wang2024open,\n"
    "  title={Open Universal Arabic ASR Leaderboard},\n"
    "  author={Wang, Yingzhi and Alhmoud, Anas and Alqurishi, Muhammad},\n"
    "  journal={arXiv preprint arXiv:2412.13788},\n"
    "  year={2024}\n"
    "}\n"
)

# Markdown body of the "Metrics" tab: metric definitions, reproduction notes,
# the benchmark dataset table and a pointer to the paper.
# The literal `\n` escapes at the start of some lines add an extra newline, so
# the sentence that follows is separated from the previous one by a blank line.
METRICS_TAB_TEXT = """
## Metrics
We report both the Word Error Rate (WER) and Character Error Rate (CER).
## Reproduction
The Open Universal Arabic ASR Leaderboard will be a continuous benchmark project. 
\nWe open-source the evaluation scripts at our [GitHub repo](https://github.com/Natural-Language-Processing-Elm/open_universal_arabic_asr_leaderboard).
\nPlease launch a discussion in our GitHub repo to let us know if you want to learn about the performance of a new model.

## Benchmark datasets
| Test Set                                                                                        | Num Dialects   | Test (h)    |
|-------------------------------------------------------------------------------------------------|----------------|-------------|
| [SADA](https://www.kaggle.com/datasets/sdaiancai/sada2022)                                      | 10             | 10.7        |
| [Common Voice 18.0](https://commonvoice.mozilla.org/en/datasets)                                | 25             | 12.6        |
| [MASC (Clean-Test)](https://ieee-dataport.org/open-access/masc-massive-arabic-speech-corpus)    | 7              | 10.5        |
| [MASC (Noisy-Test)](https://ieee-dataport.org/open-access/masc-massive-arabic-speech-corpus)    | 8              | 14.9        |
| [MGB-2](http://www.mgb-challenge.org/MGB-2.html)                                                | Unspecified    | 9.6         |
| [Casablanca](https://huggingface.co/datasets/UBC-NLP/Casablanca)                                | 8              | 7.7         |

## In-depth Analysis
We also provide a comprehensive analysis of the models' robustness, speaker adaptation, inference efficiency and memory consumption.
\nPlease check our [paper](https://arxiv.org/pdf/2412.13788) to learn more.
"""


def styled_message(message):
    """Return ``message`` wrapped in a green, centred, 20px HTML ``<p>`` element."""
    css = "color: green; font-size: 20px; text-align: center;"
    return "<p style='{}'>{}</p>".format(css, message)

# Changelog text (rendered with gr.Markdown), newest entry first. Entries are
# joined with <br>; each one follows the "<date>:[<what changed>]" pattern.
UPDATES = "<br>".join(
    [
        "Nov 13th 2025:[New models included: 8 omnilingual-asr CTC&LLM models]",
        "Sep 30th 2025:[New models included: Qwen3-Omni-30B-A3B-Instruct]",
        "Sep 22nd 2025:[New models included: Voxtral-mini and Voxtral-Small]",
        "Jan 11th 2025:[New models included: Nvidia Parakeet-CTC-XXL-1.1B-Universal and Nvidia Parakeet-CTC-XXL-1.1B-Concat]",
        "Jan 11th 2025:[New dataset included: Casablanca]",
    ]
)

# Leaderboard data in column-oriented form. Each key is the column header shown
# in the table (some headers contain a "\n"), and each list holds one value per
# model, aligned by position with the "Model" list (27 entries per column).
# WER = Word Error Rate, CER = Character Error Rate, both reported in percent.
# Rows are listed in ascending order of "Average WER⬇️".
results = {
    "Model": ["omnilingual-asr/omniASR_LLM_7B", "omnilingual-asr/omniASR_LLM_3B", "omnilingual-asr/omniASR_LLM_1B", "Qwen/Qwen3-Omni-30B-A3B-Instruct", "nvidia-conformer-ctc-large-arabic (lm)", "omnilingual-asr/omniASR_LLM_300M", "mistralai/Voxtral-Small-24B-2507", "nvidia-conformer-ctc-large-arabic (greedy)", "openai/whisper-large-v3", "omnilingual-asr/omniASR_CTC_3B", "omnilingual-asr/omniASR_CTC_7B", "facebook/seamless-m4t-v2-large", "omnilingual-asr/omniASR_CTC_1B", "openai/whisper-large-v3-turbo", "openai/whisper-large-v2", "openai/whisper-large", "mistralai/Voxtral-Mini-3B-2507", "asafaya/hubert-large-arabic-transcribe", "openai/whisper-medium", "nvidia-Parakeet-ctc-1.1b-concat", "omnilingual-asr/omniASR_CTC_300M", "nvidia-Parakeet-ctc-1.1b-universal", "facebook/mms-1b-all", "openai/whisper-small", "whitefox123/w2v-bert-2.0-arabic-4", "jonatasgrosman/wav2vec2-large-xlsr-53-arabic", "speechbrain/asr-wav2vec2-commonvoice-14-ar"],
    # Averages over the test sets; "Average WER⬇️" is the ranking column.
    "Average WER⬇️": [28.32, 29.96, 29.96, 30.71, 32.91, 32.96, 34.47, 34.74, 36.86, 37.78, 38.12, 38.16, 39.29, 40.05, 40.20, 42.57, 42.58, 45.50, 45.57, 46.54, 46.65, 51.96, 54.54, 55.13, 58.13, 60.98, 65.74],
    "Average CER": [12.52, 13.77, 13.40, 13.67, 13.84, 14.84, 15.29, 13.37, 17.21, 19.79, 20.91, 17.03, 20.47, 18.87, 19.55, 20.49, 19.90, 17.35, 22.27, 23.88, 21.86, 25.19, 21.45, 21.68, 27.62, 25.61, 30.93],
    # Per-test-set scores: one WER column and one CER column for each dataset.
    "SADA WER": [41.61, 46.18, 43.84, 44.82, 44.52, 51.38, 50.82, 47.26, 55.96, 69.85, 72.69, 62.52, 71.42, 60.36, 57.46, 63.24, 63.65, 67.82, 67.71, 70.70, 78.11, 73.58, 77.48, 78.02, 87.34, 86.82, 88.54],
    "SADA CER": [24.95, 27.27, 24.54, 26.11, 23.76, 29.10, 28.85, 22.54, 34.62, 51.70, 54.95, 37.61, 52.33, 37.67, 36.59, 40.16, 35.89, 31.83, 43.83, 46.70, 52.52, 49.48, 37.50, 33.17, 56.75, 44.20, 50.28],
    "Common Voice\nWER": [8.75, 9.15, 9.55, 11.46, 8.80, 12.03, 15.25, 10.60, 17.83, 14.19, 12.47, 21.70, 17.55, 25.73, 21.77, 26.04, 22.12, 8.01, 28.07, 26.34, 27.90, 40.01, 26.52, 24.18, 41.79, 23.00, 29.17],
    "Common Voice\nCER": [2.71, 2.80, 2.97, 4.28, 2.77, 4.04, 5.54, 3.05, 5.74, 5.74, 5.36, 6.24, 7.97, 10.89, 7.44, 9.61, 8.44, 2.37, 10.38, 9.82, 11.66, 14.64, 7.21, 6.79, 15.75, 6.64, 9.85],
    "MASC(clean-test)\nWER": [19.69, 19.90, 20.03, 21.47, 23.74, 20.66, 23.96, 24.12, 24.66, 21.48, 21.08, 25.04, 22.76, 25.51, 27.25, 28.89, 28.37, 32.94, 29.99, 30.49, 28.40, 36.16, 38.82, 35.93, 37.82, 42.75, 49.10],
    "MASC(clean-test)\nCER": [5.76, 6.13, 6.14, 5.59, 5.63, 6.22, 7.06, 5.63, 7.24, 6.11, 6.22, 7.19, 6.36, 7.55, 8.28, 9.05, 8.73, 7.15, 8.98, 8.41, 7.76, 10.29, 10.36, 9.01, 11.92, 11.87, 16.37],
    "MASC(noisy-test)\nWER": [29.29, 30.03, 30.26, 30.85, 34.29, 32.45, 34.43, 35.64, 34.63, 34.60, 35.04, 33.24, 35.73, 37.16, 38.55, 40.79, 41.27, 50.16, 42.91, 45.95, 43.26, 50.03, 57.33, 56.36, 53.28, 64.27, 69.57],
    "MASC(noisy-test)\nCER": [10.66, 11.27, 11.18, 11.28, 11.07, 12.23, 12.22, 11.02, 12.89, 12.32, 13.57, 11.92, 12.52, 13.93, 15.49, 16.31, 16.44, 15.62, 17.49, 18.72, 14.89, 20.09, 19.76, 19.43, 21.93, 24.17, 30.17],
    "MGB-2 WER": [14.13, 14.22, 15.34, 13.09, 17.20, 16.58, 16.03, 19.69, 16.26, 18.96, 20.43, 20.23, 19.96, 17.75, 25.17, 24.28, 22.56, 37.51, 29.32, 24.94, 26.85, 30.68, 39.16, 48.64, 40.66, 56.29, 64.37],
    "MGB-2 CER": [7.10, 7.06, 7.56, 6.20, 6.87, 7.86, 7.41, 7.46, 7.74, 8.28, 9.78, 9.37, 8.56, 8.34, 13.48, 12.10, 10.46, 11.07, 14.82, 9.87, 10.03, 11.36, 13.48, 15.56, 19.39, 20.44, 26.56],
    "Casablanca\nWER": [56.46, 60.27, 60.68, 62.55, 68.90, 64.64, 66.30, 71.13, 71.81, 67.58, 67.02, 66.25, 68.32, 73.79, 71.01, 72.18, 77.52, 76.53, 75.44, 80.80, 75.35, 81.30, 87.95, 87.64, 87.88, 92.72, 93.68],
    "Casablanca\nCER": [23.96, 28.06, 28.02, 28.53, 32.97, 29.61, 30.64, 30.50, 35.04, 34.59, 35.60, 29.85, 35.08, 34.83, 36.00, 35.71, 39.43, 36.03, 38.12, 49.77, 34.29, 45.31, 40.41, 46.12, 39.99, 46.33, 52.36],
}

# Build the leaderboard table and rank the models by ascending Average WER.
# A stable sort is requested explicitly so that models whose averages tie keep
# the order in which they are listed in `results`; the default algorithm
# ("quicksort") gives no ordering guarantee for equal keys.
original_df = pd.DataFrame(results).sort_values(by="Average WER⬇️", kind="stable")

# One datatype per leaderboard column: the model name, then 14 numeric metric columns.
TYPES = ["str", *["number"] * 14]

# Custom CSS passed to gr.Blocks: sets vertical overflow on the page to auto and
# gives the leaderboard header cells a 150px minimum width without wrapping.
LEADERBOARD_CSS = (
    "\n"
    "html, body {\n"
    "  overflow-y: auto !important;\n"
    "}\n"
    "\n"
    "#leaderboard-table th .header-content {\n"
    "    min-width: 150px;\n"
    "    white-space: nowrap;\n"
    "}\n"
)

def request_model(model_text):
    """Return the styled notice that points requesters to the GitHub repo.

    ``model_text`` is accepted but not used: the same message is returned
    whatever its value.
    """
    notice = "🤗 Please launch a discussion in our GitHub repo, thank you. 🤗"
    return styled_message(notice)

# Assemble the page: banner, introduction, three tabs, changelog and citation box.
with gr.Blocks(fill_width=False, fill_height=False, css=LEADERBOARD_CSS) as demo:
    gr.HTML(BANNER, elem_id="banner")
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # Each tab carries its own elem_id: elem_id becomes the HTML id of the
        # component, and ids have to be unique within a page.
        with gr.TabItem("🏅 Leaderboard", elem_id="od-benchmark-tab-table", id=0):
            # Read-only results table; its elem_id is the hook used by the
            # "#leaderboard-table" rule in LEADERBOARD_CSS.
            leaderboard_table = gr.Dataframe(
                value=original_df,
                datatype=TYPES,
                elem_id="leaderboard-table",
                interactive=False,
                visible=True,
            )

        with gr.TabItem("📈 Metrics", elem_id="od-benchmark-tab-metrics", id=1):
            gr.Markdown(METRICS_TAB_TEXT, elem_classes="markdown-text")

        with gr.TabItem("✉️✨ Request a model here!", elem_id="od-benchmark-tab-request", id=2):
            with gr.Column():
                gr.Markdown("# ✉️✨ Request results for a new model here!", elem_classes="markdown-text")
                model_name_textbox = gr.Textbox(label="Model name (user_name/model_name)")
                # Placeholder that receives the HTML returned by request_model.
                mdw_submission_result = gr.Markdown()
                btn_submit = gr.Button(value="🚀 Request")
                btn_submit.click(request_model, [model_name_textbox], mdw_submission_result)

    # Changelog, displayed below the tabs.
    gr.Markdown(UPDATES, elem_classes="markdown-text")

    with gr.Row():
        # Collapsed by default; expands to show the BibTeX entry with a copy button.
        with gr.Accordion("📙 Citation", open=False):
            gr.Textbox(
                value=CITATION_BUTTON_TEXT, lines=7,
                label="Copy the BibTeX snippet to cite this source",
                elem_id="citation-button",
                show_copy_button=True,
            )

# Launch the app when this file is executed.
# NOTE(review): BANNER references the image through a remote URL, so the local
# "banner.png" entry in allowed_paths may no longer be needed — confirm.
demo.launch(allowed_paths=["banner.png"], ssr_mode=False)