"""Streamlit site for the NTCIR-19 AEOLLM-2 task (leaderboard + info pages).

NOTE(review): this file was recovered from a mangled extraction in which the
HTML/CSS fragments embedded in several ``st.markdown`` calls were lost.
Sections marked "reconstructed" are best-effort rewrites of the lost pieces
and must be confirmed against the deployed app / original repository.
"""

import json
from pathlib import Path

import pandas as pd
import streamlit as st
from streamlit_option_menu import option_menu

# Streamlit >= 1.35 replaced ``use_column_width`` with ``width``; detect the
# installed version once so ``st.image`` calls stay compatible either way.
STREAMLIT_VERSION = tuple(map(int, st.__version__.split(".")[:2]))
IMAGE_WIDTH_PARAM = (
    {"width": "stretch"}
    if STREAMLIT_VERSION >= (1, 35)
    else {"use_column_width": True}
)

# ============== Configuration ==============
# Sidebar navigation: page label -> bootstrap icon + renderer function name.
# The renderer is looked up by name, so every "fn" value must exist as a
# module-level function when the navigation code runs.
PAGE_CONFIG = {
    "LeaderBoard": {"icon": "trophy", "fn": "render_leaderboard"},
    "Introduction": {"icon": "house", "fn": "render_intro"},
    "Methodology": {"icon": "book", "fn": "render_methodology"},
    "Datasets": {"icon": "database", "fn": "render_datasets"},
    "Important Dates": {"icon": "calendar", "fn": "render_dates"},
    "Evaluation Metrics": {"icon": "clipboard", "fn": "render_metrics"},
    "Submit": {"icon": "upload", "fn": "render_submit"},
    "Organisers": {"icon": "people", "fn": "render_organisers"},
    "References": {"icon": "book", "fn": "render_references"},
}

# Styling passed straight through to streamlit_option_menu's ``styles=``.
NAV_STYLES = {
    "container": {"padding": "5px"},
    "icon": {"color": "orange", "font-size": "18px"},
    "nav-link": {
        "font-size": "16px",
        "text-align": "left",
        "margin": "0px",
        "--hover-color": "#6c757d",
    },
    "nav-link-selected": {"background-color": "#FF6347"},
}

# NOTE(review): the original CSS body was lost in extraction — restore it here.
CUSTOM_CSS = """ """


# ============== Data loading ==============
@st.cache_data
def load_leaderboard_data() -> dict:
    """Load the leaderboard data from the bundled JSON file."""
    data_path = Path(__file__).parent / "data" / "leaderboard.json"
    return json.loads(data_path.read_text(encoding="utf-8"))


def format_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with every numeric column formatted to 4 decimals.

    ``include="number"`` also catches float32/int32 columns, which the previous
    explicit ``["float64", "int64"]`` list silently skipped.
    """
    df = df.copy()
    for col in df.select_dtypes(include="number").columns:
        df[col] = df[col].map(lambda x: f"{x:.4f}")
    return df


# ============== Page renderers ==============
def render_leaderboard():
    """Render the leaderboard page: centred logo, per-task result tables, notes."""
    # Centre the logo by placing it in the middle of a 1:2:1 column split.
    logo_col = st.columns([1, 2, 1])[1]
    with logo_col:
        st.image("asserts/logo.png", **IMAGE_WIDTH_PARAM)

    # NOTE(review): the HTML title template and data-unpacking statements were
    # lost in extraction; the unpacking below is a reconstruction — confirm the
    # key names against data/leaderboard.json.
    data = load_leaderboard_data()
    entries = data.get("entries", [])
    teams = [entry.get("TeamId", "") for entry in entries]
    methods = [entry.get("Methods", "") for entry in entries]
    metrics = data.get("metrics", ["Accuracy", "Kendall's tau", "Spearman"])

    for task_name in data.get("tasks", []):
        st.markdown(f"### {task_name}")
        # Build task data with numeric values first so sorting is numeric,
        # then format to strings only for display.
        task_data_numeric = {"TeamId": teams, "Methods": methods}
        for metric in metrics:
            values = []
            for entry in entries:
                task_result = entry["results"].get(task_name, {})
                # Missing scores default to 0 so every row stays rectangular.
                values.append(task_result.get(metric, 0))
            task_data_numeric[metric] = values
        df_task = pd.DataFrame(task_data_numeric)
        # Sort by Accuracy descending (or first metric if not found).
        sort_metric = "Accuracy" if "Accuracy" in metrics else metrics[0]
        df_task = (
            df_task.sort_values(by=sort_metric, ascending=False)
            .reset_index(drop=True)
        )
        st.dataframe(
            format_dataframe(df_task), use_container_width=True, hide_index=True
        )

    st.markdown("""
🔗 To register for AEOLLM task, you can visit the following link and choose our AEOLLM task: [https://research.nii.ac.jp/ntcir/ntcir-19/howto.html](https://research.nii.ac.jp/ntcir/ntcir-19/howto.html).

📃 To submit, refer to the "Submit" section in the left-hand navigation bar.

🤗 A baseline example can be found in the [baseline_example](https://drive.google.com/drive/folders/1wmvugIiA5In7SHYOgyO7EKULFET5NWY3) folder.

📝 Refer to other sections in the navigation bar for details on evaluation metrics, datasets, important dates and methodology.

🕒 The Leaderboard will be updated daily around 24:00 Beijing Time.
""")


def render_intro():
    """Render the task introduction page."""
    st.header("Introduction")
    st.markdown(
        """Building on the success of the NTCIR-18 core task AEOLLM, we propose AEOLLM-2 for NTCIR-19 to further investigate automatic evaluation methods for Large Language Models (LLMs), particularly in long-form text generation scenarios.

💡 In AEOLLM-2, we introduce a new subtask: Deep Research Evaluation. This subtask focuses on the automated evaluation of long-form deep research reports generated by LLMs.

🔍 Participants will be asked to develop evaluation methods that automatically score the quality of the generated reports.

⚖️ The performance of each method will be measured by comparing its scores against human-annotated ground-truth labels.

🚀 We believe that AEOLLM-2 will drive research in robust, scalable, and interpretable long-text evaluation techniques.

First, we choose DeepResearchBench as our primary question source.

Second, we choose a series of popular LLMs during the competition to generate answers.

Third, we manually annotate the answer sets for each question, which will be used as gold standards for evaluating the performance of different evaluation methods.

Last, we will collect evaluation results from participants and calculate consistency with manually annotated results. We will use Accuracy, Kendall's tau and Spearman correlation coefficient as the evaluation metrics.
""",
        unsafe_allow_html=True,
    )


def render_datasets():
    """Render the datasets page: generation, annotation, and usage details."""
    st.header("Answer Generation")
    st.markdown("""
- We collected a question set consisting of 40 questions in total, including 29 Chinese questions and 11 English questions, covering a broad range of topics.
- For each question, we designed 4 highly similar prompt variants that nevertheless induce substantial differences in answer quality, such as rich vs. shallow and neutral vs. biased responses, making long-form evaluation considerably more challenging.
- We then selected 4 models to generate answers, resulting in 640 instances (40 x 4 x 4) in total.
""")
    st.header("Human Annotation")
    st.markdown("""
- For each instance (question-answer pair), we conducted both objective and subjective annotation.
- For the objective evaluation, we considered three dimensions: comprehensiveness, insight, and instruction following. Each dimension was assessed based on a set of fine-grained criteria with corresponding weights, and the weighted scores were aggregated into three final dimension scores. To improve efficiency, we first applied RAG + LLM pre-scoring, followed by human verification.
- For the subjective evaluation, we considered readability. This dimension was also assessed using a set of fine-grained criteria with corresponding weights, which were aggregated into one final subjective score. The subjective score was assigned by two human annotators on a 10-point scale; if their scores were inconsistent, a third annotator was introduced for adjudication.
""")
    st.header("Data Acquisition and Usage")
    st.markdown("""
We divided the 640 instances into three parts:

1️⃣ train set: 25% of the data was designated as the training set (including human annotations) for participants to reference when designing their methods.

2️⃣ test set: Another 25% of the data was set aside as the test set (excluding human annotations), used to evaluate the performance of participants' methods and to generate the **leaderboard**.

3️⃣ reserved set: The remaining 50% of the data was reserved for **the final evaluation**.

Both the training set and the test set can be downloaded from the provided link: [https://huggingface.co/datasets/THUIR/AEOLLM](https://huggingface.co/datasets/THUIR/AEOLLM).
""")


def render_dates():
    """Render the important-dates page."""
    st.header("Important Dates")
    st.markdown(
        """All deadlines are at 11:59pm in the Anywhere on Earth (AOE) timezone.""",
        unsafe_allow_html=True,
    )
    col1, col2 = st.columns(2)
    with col1:
        st.markdown("""
Kickoff Event:

During the Dry Run (until July 15, 2026), we will use the test set (https://huggingface.co/datasets/THUIR/AEOLLM) to evaluate the performance of participants' methods and release the results on the Leaderboard.

Before the Formal Run begins (before July 15, 2026), we will release the reserved set.

Participants need to submit their results for the reserved set before the Formal Run ends (before August 1, 2026).

The schedule may be adjusted according to actual circumstances.
""")
    with col2:
        # NOTE(review): the right-hand column's original content was lost in
        # extraction — restore the dates table/graphic here.
        st.empty()


def render_organisers():
    """Render the organisers page with contact details."""
    st.header("Organisers")
    st.markdown("""
Junjie Chen [chenjj826@gmail.com] (Tsinghua University)

Haitao Li [liht22@mails.tsinghua.edu.cn] (Tsinghua University)

Yiqun Liu (Tsinghua University)

Qingyao Ai [aiqy@tsinghua.edu.cn] (Tsinghua University)

Please feel free to contact us! 😉
""")