import gradio as gr

from RadEval import RadEval


def run_radeval_simple(ref_text, hyp_text, selected_metrics):
    """
    Run RadEval with the selected metrics on a single reference/hypothesis pair.
    """
    try:
        # Basic input validation before any models are loaded
        if not ref_text.strip() or not hyp_text.strip():
            return "Please provide both a reference and a hypothesis report.", [["No input", ""]]

        refs = [ref_text.strip()]
        hyps = [hyp_text.strip()]

        # Configure RadEval based on the selected metrics
        config = {
            'do_radgraph': 'RadGraph F1' in selected_metrics,
            'do_bleu': 'BLEU' in selected_metrics,
            'do_rouge': 'ROUGE' in selected_metrics,
            'do_bertscore': 'BERTScore' in selected_metrics,
            'do_chexbert': 'CheXbert F1' in selected_metrics,
            'do_ratescore': 'RaTEScore' in selected_metrics,
            'do_radcliq': 'RadCliQ' in selected_metrics,
            'do_temporal': 'Temporal F1' in selected_metrics,
            'do_radeval_bertsore': 'RadEval BERTScore' in selected_metrics,
            'do_green': 'GREEN' in selected_metrics,
            'do_srr_bert': 'SRR-BERT' in selected_metrics
        }

        # Initialize RadEval with the selected metrics
        evaluator = RadEval(**config)

        # Run evaluation
        results = evaluator(refs=refs, hyps=hyps)

        # Prepare results for display
        table_data = []
        analysis_text = "## RadEval Results\n\n"
        analysis_text += f"**Reference:** {ref_text[:100]}{'...' if len(ref_text) > 100 else ''}\n\n"
        analysis_text += f"**Hypothesis:** {hyp_text[:100]}{'...' if len(hyp_text) > 100 else ''}\n\n"
        analysis_text += "### Evaluation Scores:\n\n"

        for metric, score in results.items():
            if isinstance(score, (int, float)):
                formatted_score = f"{score:.4f}" if isinstance(score, float) else str(score)
                table_data.append([metric, formatted_score])
                analysis_text += f"- **{metric}**: {formatted_score}\n"
            elif isinstance(score, dict):
                # Handle nested metrics (e.g. a metric that returns several sub-scores)
                for sub_metric, sub_score in score.items():
                    if isinstance(sub_score, (int, float)):
                        formatted_score = f"{sub_score:.4f}" if isinstance(sub_score, float) else str(sub_score)
                        metric_name = f"{metric}_{sub_metric}"
                        table_data.append([metric_name, formatted_score])
                        analysis_text += f"- **{metric_name}**: {formatted_score}\n"

        if not table_data:
            return "No metrics were computed. Please select at least one metric.", [["No results", ""]]

        return analysis_text, table_data

    except ImportError as e:
        error_msg = f"Import Error: {str(e)}. Please ensure RadEval dependencies are installed."
        return error_msg, [["Error", error_msg]]
    except Exception as e:
        error_msg = f"Evaluation Error: {str(e)}"
        return error_msg, [["Error", error_msg]]


# Example reference/hypothesis pairs for radiology reports
examples = {
    "Normal vs Normal": {
        "ref": "Heart size is normal. Lungs are clear. No pleural effusion or pneumothorax.",
        "hyp": "Cardiac silhouette is within normal limits. Lungs are clear bilaterally. No effusion or pneumothorax identified.",
    },
    "Pneumonia Case": {
        "ref": "Moderate cardiomegaly. Bilateral lower lobe consolidations consistent with pneumonia.",
        "hyp": "Enlarged heart. Worsening bilateral infiltrates in the lower lobes suggestive of pneumonia.",
    },
    "Temporal Comparison": {
        "ref": "Compared to prior study, the pleural effusion has increased in size. New bilateral infiltrates are present.",
        "hyp": "The pleural effusion is larger than on the previous examination. There are new bilateral pulmonary infiltrates.",
    },
    "Discordant Reports": {
        "ref": "No acute cardiopulmonary process. Normal heart size and lung fields.",
        "hyp": "Mild cardiomegaly with bilateral lower lobe atelectasis. Small pleural effusion on the right.",
    },
    "Ambiguous Language": {
        "ref": "There is a small left-sided pleural effusion with adjacent atelectasis.",
        "hyp": "Possible small effusion on the left. Atelectasis cannot be excluded.",
    },
    "Surgical Follow-up": {
        "ref": "Status post coronary artery bypass grafting. No evidence of acute complication.",
        "hyp": "Post-operative changes from CABG are present. No signs of surgical complication.",
    },
    "False Positive": {
        "ref": "No focal consolidation, pleural effusion, or pneumothorax identified.",
        "hyp": "Right lower lobe consolidation concerning for pneumonia.",
    },
    "Textual Hallucination": {
        "ref": "Heart and mediastinum are normal. Lungs are clear.",
        "hyp": "Large left pleural effusion with mediastinal shift to the right.",
    },
    "Negation Challenge": {
        "ref": "No evidence of pneumothorax or pleural effusion.",
        "hyp": "Evidence of small pneumothorax on the right.",
    },
    "Fine-grained Difference": {
        "ref": "Mild interstitial markings at the lung bases, likely chronic.",
        "hyp": "Subtle increased interstitial opacities at both lung bases, likely chronic in nature.",
    }
}


def update_fields(choice):
    """Update the text fields based on the example selection."""
    if choice == "Custom":
        return gr.update(value="", interactive=True), gr.update(value="", interactive=True)
    else:
        return (
            gr.update(value=examples[choice]["ref"], interactive=False),
            gr.update(value=examples[choice]["hyp"], interactive=False)
        )


# Available metrics (ordered by computational cost)
available_metrics = [
    "BLEU",
    "ROUGE",
    "BERTScore",
    "Temporal F1",
    "RadEval BERTScore",
    "RaTEScore",
    "RadCliQ",
    "SRR-BERT",
    "CheXbert F1",
    "RadGraph F1",
    "GREEN"
]

# Fast metrics for the default selection
default_metrics = ["BLEU", "ROUGE", "BERTScore"]


with gr.Blocks(title="RadEval: A framework for radiology text evaluation", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🩺 RadEval: A framework for radiology text evaluation
        [GitHub]() | [PyPI](https://pypi.org/project/RadEval/) | [Video](https://justin13601.github.io/files/radeval.mp4) | [arXiv]() | [RadEvalModernBERT Model](https://huggingface.co/IAMJB/RadEvalModernBERT) | [Expert Dataset]()

        **RadEval** is a lightweight, extensible framework for **evaluating radiology reports** using both standard NLP metrics (e.g. BLEU, ROUGE, BERTScore) and **radiology-specific measures** (e.g. RadGraph, CheXbert, GREEN).
        Whether you're benchmarking generation systems or validating clinical correctness, RadEval offers **comprehensive and interpretable** metrics out of the box.

        **⚠️ Performance Warning ⚠️**
        This demo currently runs on **CPU**. Slower metrics (e.g. RadGraph, CheXbert, GREEN) may take a while to complete. Please be patient.
        """
    )

    with gr.Row():
        choice = gr.Radio(
            label="📋 Choose Example or Custom Input",
            choices=["Custom"] + list(examples.keys()),
            value="Custom",
            interactive=True
        )

    with gr.Row():
        with gr.Column(scale=1):
            ref_input = gr.Textbox(
                label="📄 Reference Report (Ground Truth)",
                lines=5,
                placeholder="Enter the reference radiology report here...",
                info="The ground truth or expert-written report"
            )
        with gr.Column(scale=1):
            hyp_input = gr.Textbox(
                label="🤖 Hypothesis Report (Generated)",
                lines=5,
                placeholder="Enter the generated/predicted radiology report here...",
                info="The AI-generated or system-produced report"
            )

    choice.change(
        update_fields,
        inputs=choice,
        outputs=[ref_input, hyp_input],
    )

    with gr.Row():
        metrics_selection = gr.CheckboxGroup(
            label="🎯 Select Evaluation Metrics",
            choices=available_metrics,
            value=default_metrics,
            interactive=True,
            info="Select metrics to compute. Some metrics may take longer (RadGraph, CheXbert, GREEN)."
        )

    with gr.Row():
        run_button = gr.Button("🚀 Run RadEval", variant="primary", size="lg")

    with gr.Row():
        with gr.Column(scale=2):
            analysis_output = gr.Markdown(
                value="📊 **Results will appear here after evaluation...**\n\nSelect your texts and metrics, then click 'Run RadEval'."
            )
        with gr.Column(scale=1):
            table_output = gr.DataFrame(
                label="📈 Detailed Scores",
                headers=["Metric", "Score"],
                wrap=True
            )

    # Information section
    with gr.Accordion("💡 Metric Information", open=False):
        gr.Markdown(
            """
            ### 📊 Available Metrics:

            **Traditional NLG Metrics:**
            - **BLEU**: N-gram overlap between reference and hypothesis
            - **ROUGE**: Recall-oriented overlap (ROUGE-1, ROUGE-2, ROUGE-L)
            - **BERTScore**: Semantic similarity using BERT embeddings

            **Radiology-Specific Metrics:**
            - **RadGraph F1**: Entity and relation extraction for radiology
            - **CheXbert F1**: Chest X-ray finding classification performance
            - **RaTEScore**: Radiology-aware text evaluation score
            - **RadCliQ**: Composite metric for radiology reports
            - **Temporal F1**: Temporal entity and relationship evaluation
            - **RadEval BERTScore**: BERTScore computed with a radiology-specific BERT model
            - **GREEN**: Generative evaluation with natural language explanations
            - **SRR-BERT**: Structured radiology reasoning evaluation

            ### ⚡ Performance Notes:
            - **Fast**: BLEU, ROUGE, BERTScore, Temporal F1
            - **Medium**: RadEval BERTScore, RaTEScore, RadCliQ, SRR-BERT
            - **Slow**: CheXbert F1, RadGraph F1, GREEN (require model downloads)
            """
        )

    run_button.click(
        run_radeval_simple,
        inputs=[ref_input, hyp_input, metrics_selection],
        outputs=[analysis_output, table_output]
    )


if __name__ == "__main__":
    demo.launch()
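
# For reference, a minimal sketch of the same evaluation outside the Gradio UI.
# It only reuses the constructor flags and call signature that appear above
# (do_bleu / do_rouge / do_bertscore, evaluator(refs=..., hyps=...)); adjust the
# flags to whichever metrics your installed RadEval version supports.
#
#     from RadEval import RadEval
#
#     evaluator = RadEval(do_bleu=True, do_rouge=True, do_bertscore=True)
#     scores = evaluator(
#         refs=["Heart size is normal. Lungs are clear. No pleural effusion or pneumothorax."],
#         hyps=["Cardiac silhouette is within normal limits. Lungs are clear bilaterally."],
#     )
#     print(scores)  # dict of metric name -> score (possibly nested), as consumed by run_radeval_simple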