"""Streamlit app for the NTCIR-19 AEOLLM-2 task website (leaderboard + info pages).

NOTE(review): this file was recovered from a whitespace-mangled copy in which
code newlines were lost and the HTML tags inside ``unsafe_allow_html`` strings
appear to have been stripped. The code below reconstructs valid Python and
preserves all visible string text; the lost markup could not be recovered and
is flagged inline where it was clearly present.
"""

import json
from pathlib import Path

import pandas as pd
import streamlit as st
from streamlit_option_menu import option_menu

# Image-width kwarg compatibility across streamlit versions.
# NOTE(review): confirm against the streamlit changelog that width="stretch"
# is accepted from version (1, 35) — the cutoff cannot be verified from here.
STREAMLIT_VERSION = tuple(map(int, st.__version__.split(".")[:2]))
IMAGE_WIDTH_PARAM = (
    {"width": "stretch"} if STREAMLIT_VERSION >= (1, 35) else {"use_column_width": True}
)

# ============== Configuration ==============
# Maps sidebar entry label -> bootstrap icon name + renderer function name
# (resolved via globals() in main()).
PAGE_CONFIG = {
    "LeaderBoard": {"icon": "trophy", "fn": "render_leaderboard"},
    "Introduction": {"icon": "house", "fn": "render_intro"},
    "Methodology": {"icon": "book", "fn": "render_methodology"},
    "Datasets": {"icon": "database", "fn": "render_datasets"},
    "Important Dates": {"icon": "calendar", "fn": "render_dates"},
    "Evaluation Metrics": {"icon": "clipboard", "fn": "render_metrics"},
    "Submit": {"icon": "upload", "fn": "render_submit"},
    "Organisers": {"icon": "people", "fn": "render_organisers"},
    "References": {"icon": "book", "fn": "render_references"},
}

# CSS overrides for the streamlit_option_menu sidebar widget.
NAV_STYLES = {
    "container": {"padding": "5px"},
    "icon": {"color": "orange", "font-size": "18px"},
    "nav-link": {
        "font-size": "16px",
        "text-align": "left",
        "margin": "0px",
        "--hover-color": "#6c757d",
    },
    "nav-link-selected": {"background-color": "#FF6347"},
}

# NOTE(review): CUSTOM_CSS is effectively empty in the recovered source; its
# original <style> content may have been stripped with the other markup.
CUSTOM_CSS = """ """


# ============== Data loading ==============
@st.cache_data
def load_leaderboard_data() -> dict:
    """Load leaderboard data from data/leaderboard.json next to this file.

    Returns the parsed JSON object; expected keys (per render_leaderboard):
    "entries", "tasks", "metrics".
    """
    data_path = Path(__file__).parent / "data" / "leaderboard.json"
    with open(data_path, encoding="utf-8") as f:
        return json.load(f)


def format_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with numeric columns formatted to 4 decimal places.

    Only float64/int64 columns are touched; formatted values become strings.
    """
    df = df.copy()
    for col in df.select_dtypes(include=["float64", "int64"]).columns:
        df[col] = df[col].apply(lambda x: f"{x:.4f}")
    return df


# ============== Page renderers ==============
def render_leaderboard():
    """Render the leaderboard page: banner, per-dimension tabs, and footer notes."""
    # Centered logo on top, then the title banner.
    logo_col = st.columns([1, 2, 1])[1]  # middle column for centering
    with logo_col:
        st.image("asserts/logo.png")
    # NOTE(review): the HTML wrapper of this banner was lost; only the text
    # content survives.
    st.markdown("""
🏆

Leaderboard

""", unsafe_allow_html=True)
    # Bottom spacing.
    # NOTE(review): the original spacer markup (likely an empty styled <div>)
    # was lost from the string.
    st.markdown("\n", unsafe_allow_html=True)

    data = load_leaderboard_data()
    entries = data["entries"]
    tasks = data["tasks"]
    metrics = data["metrics"]
    task_names = list(tasks.keys())

    # Tab icons mapping
    tab_icons = {
        "Weighted Overall": "🎯",
        "Comprehensiveness": "📊",
        "Insight": "💡",
        "Instruction Following": "📋",
        "Readability": "📖",
    }

    # Team/method columns shared by every tab.
    teams = [e["team"] for e in entries]
    methods = [e["method"] for e in entries]

    # One tab per evaluation dimension, labelled with an icon.
    tab_labels = [f"{tab_icons.get(name, '📌')} {name}" for name in task_names]
    tab_objects = st.tabs(tab_labels)
    for tab, task_name in zip(tab_objects, task_names):
        with tab:
            dataset = tasks[task_name].get("dataset", "")
            icon = tab_icons.get(task_name, "📌")
            title = f"{icon} {task_name}"
            if dataset:
                title += f" | Dataset: {dataset}"
            # NOTE(review): the HTML around {title} was lost from the string.
            st.markdown(f"""

{title}

""", unsafe_allow_html=True)

            # Build task data with numeric values first for sorting.
            # Missing task results / metrics default to 0.
            task_data_numeric = {"TeamId": teams, "Methods": methods}
            for metric in metrics:
                values = []
                for entry in entries:
                    task_result = entry["results"].get(task_name, {})
                    values.append(task_result.get(metric, 0))
                task_data_numeric[metric] = values
            df_task = pd.DataFrame(task_data_numeric)

            # Sort by Accuracy descending (or first metric if not found).
            sort_metric = "Accuracy" if "Accuracy" in metrics else metrics[0]
            df_task = df_task.sort_values(by=sort_metric, ascending=False).reset_index(drop=True)
            st.dataframe(format_dataframe(df_task), use_container_width=True, hide_index=True)

    st.markdown("""
🔗 To register for AEOLLM task, you can visit the following link and choose our AEOLLM task: [https://research.nii.ac.jp/ntcir/ntcir-19/howto.html](https://research.nii.ac.jp/ntcir/ntcir-19/howto.html).

📃 To submit, refer to the "Submit" section in the left-hand navigation bar.

🤗 A baseline example can be found in the [baseline_example](https://drive.google.com/drive/folders/1wmvugIiA5In7SHYOgyO7EKULFET5NWY3) folder.

📝 Refer to other sections in the navigation bar for details on evaluation metrics, datasets, important dates and methodology.

🕒 The Leaderboard will be updated daily around 24:00 Beijing Time.
""")


def render_intro():
    """Render the task introduction page."""
    st.header("Introduction")
    st.markdown("""

Building on the success of the NTCIR-18 core task AEOLLM, we propose AEOLLM-2 for NTCIR-19 to further investigate automatic evaluation methods for Large Language Models (LLMs), particularly in long-form text generation scenarios.

💡 In AEOLLM-2, we introduce a new subtask: Deep Research Evaluation. This subtask focuses on the automated evaluation of long-form deep research reports generated by LLMs.

🔍 Participants will be asked to develop evaluation methods that automatically score the quality of the generated reports.

⚖️ The performance of each method will be measured by comparing its scores against human-annotated ground-truth labels.

🚀 We believe that AEOLLM-2 will drive research in robust, scalable, and interpretable long-text evaluation techniques.

""", unsafe_allow_html=True)


def render_methodology():
    """Render the methodology page: pipeline figure plus a four-step description."""
    st.header("Methodology")
    _, col_center, _ = st.columns([1, 3, 1])
    with col_center:
        st.image("asserts/method.svg", **IMAGE_WIDTH_PARAM)
    st.markdown("""

First, we choose DeepResearchBench as our primary question source.

Second, we choose a series of popular LLMs during the competition to generate answers.

Third, we manually annotate the answer sets for each question, which will be used as gold standards for evaluating the performance of different evaluation methods.

Last, we will collect evaluation results from participants and calculate consistency with manually annotated results. We will use Accuracy, Kendall's tau and Spearman correlation coefficient as the evaluation metrics.

""", unsafe_allow_html=True)


def render_datasets():
    """Render the datasets page: answer generation, annotation, and data usage."""
    st.header("Answer Generation")
    st.markdown("""
- We collected a question set consisting of 40 questions in total, including 29 Chinese questions and 11 English questions, covering a broad range of topics.
- For each question, we designed 4 highly similar prompt variants that nevertheless induce substantial differences in answer quality, such as rich vs. shallow and neutral vs. biased responses, making long-form evaluation considerably more challenging.
- We then selected 4 models to generate answers, resulting in 640 instances (40 x 4 x 4) in total.
""")
    st.header("Human Annotation")
    st.markdown("""
- For each instance (question-answer pair), we conducted both objective and subjective annotation.
- For the objective evaluation, we considered three dimensions: comprehensiveness, insight, and instruction following. Each dimension was assessed based on a set of fine-grained criteria with corresponding weights, and the weighted scores were aggregated into three final dimension scores. To improve efficiency, we first applied RAG + LLM pre-scoring, followed by human verification.
- For the subjective evaluation, we considered readability. This dimension was also assessed using a set of fine-grained criteria with corresponding weights, which were aggregated into one final subjective score. The subjective score was assigned by two human annotators on a 10-point scale; if their scores were inconsistent, a third annotator was introduced for adjudication.
""")
    st.header("Data Acquisition and Usage")
    st.markdown("""
We divided the 640 instances into three parts:

1️⃣ train set: 25% of the data was designated as the training set (including human annotations) for participants to reference when designing their methods.

2️⃣ test set: Another 25% of the data was set aside as the test set (excluding human annotations), used to evaluate the performance of participants' methods and to generate the **leaderboard**.

3️⃣ reserved set: The remaining 50% of the data was reserved for **the final evaluation**.

Both the training set and the test set can be downloaded from the provided link: [https://huggingface.co/datasets/THUIR/AEOLLM](https://huggingface.co/datasets/THUIR/AEOLLM).
""")


def render_dates():
    """Render the important-dates page as a two-column schedule plus notes."""
    st.header("Important Dates")
    st.markdown(
        """

All deadlines are at 11:59pm in the Anywhere on Earth (AOE) timezone.

""",
        unsafe_allow_html=True,
    )
    # Two columns: event labels on the left, dates on the right (row-aligned).
    col1, col2 = st.columns(2)
    with col1:
        st.markdown("""
Kickoff Event:
Dataset Release:
Dry Run Deadline:
Formal Run:
Evaluation Results Return:
Task Overview Release (Draft):
Submission Due of Participant Papers (Draft):
Camera-Ready Participant Paper Due:
NTCIR-19 Conference:
""", unsafe_allow_html=True)
    with col2:
        st.markdown("""
Sep 3, 2025
March 15, 2026
July 15, 2026
July 15, 2026 - August 1, 2026
August 1, 2026
August 1, 2026
September 1, 2026
November 1, 2026
December 8-10, 2026
""", unsafe_allow_html=True)
    st.markdown("""

During the Dry Run (until July 15, 2026), we will use the test set (https://huggingface.co/datasets/THUIR/AEOLLM) to evaluate the performance of participants' methods and release the results on the Leaderboard.

Before the Formal Run begins (before July 15, 2026), we will release the reserved set. Participants need to submit their results for the reserved set before the Formal Run ends (before August 1, 2026).

The schedule may be adjusted according to actual circumstances.

""", unsafe_allow_html=True)


def render_metrics():
    """Render the evaluation-metrics page (Accuracy, Kendall's tau, Spearman's rho)."""
    st.header("Evaluation Metrics")
    st.markdown("""
- **Acc(Accuracy):** The proportion of identical preference results between the model and human annotations. Specifically, we first convert individual scores into pairwise preferences and then calculate consistency with human annotations.

- **Kendall's tau:** Measures the ordinal association between two ranked variables.

$$ \\tau=\\frac{C-D}{\\frac{1}{2}n(n-1)} $$

where:
- $C$ is the number of concordant pairs,
- $D$ is the number of discordant pairs,
- $n$ is the number of pairs.

- **Spearman's Rank Correlation Coefficient:** Measures the strength and direction of the association between two ranked variables.

$$ \\rho = 1 - \\frac{6 \\sum d_i^2}{n(n^2 - 1)} $$

where:
- $d_i$ is the difference between the ranks of corresponding elements in the two lists,
- $n$ is the number of elements.
""", unsafe_allow_html=True)


def render_submit():
    """Render the submission page: file format spec and submission instructions."""
    st.header("File Format")
    st.markdown("""
We will be following a similar format as the ones used by most **TREC submissions**:

1. White space is used to separate columns.
2. The width of the columns in the format is not important, but it is important to have exactly seven columns per line with at least one space between the columns.

**taskId questionId answerId comprehensiveness_score insight_score instruction_following_score readability_score**

- the first column is the taskId (index different tasks)
- the second column is questionId (index different questions in the same task)
- the third column is answerId (index the answer provided by different LLMs to the same question)
- the fourth column is comprehensiveness_score (index the comprehensiveness score)
- the fifth column is insight_score (index the insight score)
- the sixth column is instruction_following_score (index the instruction following score)
- the seventh column is readability_score (index the readability score)
""")
    st.header("Submit")
    st.markdown("""
📄 Please organize the answers in a **txt** file, name the file as **teamId_methods.txt** and submit it through the link below:

Dry run: [https://forms.gle/P13jG4Bvyi14uRvr5](https://forms.gle/P13jG4Bvyi14uRvr5)

⏱️ Each team can submit up to 5 times per day, and only the latest submission will be considered.

🔗 An example of the submission file content is [here](https://drive.google.com/file/d/1vkH5r9zlgx74xoiS2tTkn-jy0uZSXKJL/view?usp=drive_link).
""")


def render_organisers():
    """Render the organisers page: contact list and group photo."""
    st.header("Organisers")
    st.markdown("""

Junjie Chen [chenjj826@gmail.com] (Tsinghua University)
Haitao Li [liht22@mails.tsinghua.edu.cn] (Tsinghua University)
Yiqun Liu (Tsinghua University)
Qingyao Ai [aiqy@tsinghua.edu.cn] (Tsinghua University)

Please feel free to contact us! 😉

""", unsafe_allow_html=True)
    st.image("asserts/organizer.png")


def render_references():
    """Render the references page."""
    st.header("References")
    # NOTE(review): the trailing "pdf" tokens were presumably hyperlinks whose
    # targets were lost with the stripped markup.
    st.markdown("""
[1] Mao R, Chen G, Zhang X, et al. GPTEval: A survey on assessments of ChatGPT and GPT-4. pdf

[2] Chang Y, Wang X, Wang J, et al. A survey on evaluation of large language models. pdf

[3] Chan C M, Chen W, Su Y, et al. Chateval: Towards better llm-based evaluators through multi-agent debate. pdf

[4] Li R, Patel T, Du X. Prd: Peer rank and discussion improve large language model based evaluations. pdf

[5] Chu Z, Ai Q, Tu Y, et al. Pre: A peer review based large language model evaluator. pdf

[6] Du M, Xu B, Zhu C, et al. DeepResearch Bench: A Comprehensive Benchmark for Deep Research Agents. pdf

""", unsafe_allow_html=True)


# ============== Main ==============
def main():
    """Application entry point: page setup, sidebar navigation, and routing."""
    st.set_page_config(page_title="AEOLLM 2", page_icon="👋", layout="wide")
    st.title("NTCIR-19 Automatic Evaluation of LLMs (AEOLLM) 2")

    # Sidebar navigation.
    with st.sidebar:
        page = option_menu(
            "Navigation",
            list(PAGE_CONFIG.keys()),
            icons=[v["icon"] for v in PAGE_CONFIG.values()],
            menu_icon="cast",
            default_index=0,
            styles=NAV_STYLES,
        )

    # Load custom styles.
    st.markdown(CUSTOM_CSS, unsafe_allow_html=True)

    # Route to the renderer registered for the selected page.
    page_fn = globals()[PAGE_CONFIG[page]["fn"]]
    page_fn()


if __name__ == "__main__":
    main()