| import gradio as gr | |
| import os | |
| import base64 | |
| import logging | |
| import torch | |
| import threading | |
| import time | |
| import json | |
| from transformers import ( | |
| AutoModelForCausalLM, | |
| AutoTokenizer, | |
| TextIteratorStreamer, | |
| StoppingCriteria, | |
| StoppingCriteriaList, | |
| ) | |
| from transformers import logging as hf_logging | |
| import spaces | |
| from llama_index.core import ( | |
| StorageContext, | |
| VectorStoreIndex, | |
| load_index_from_storage, | |
| Document as LlamaDocument, | |
| ) | |
| from llama_index.core import Settings | |
| from llama_index.core.node_parser import ( | |
| HierarchicalNodeParser, | |
| get_leaf_nodes, | |
| get_root_nodes, | |
| ) | |
| from llama_index.core.retrievers import AutoMergingRetriever | |
| from llama_index.core.storage.docstore import SimpleDocumentStore | |
| from llama_index.llms.huggingface import HuggingFaceLLM | |
| from llama_index.embeddings.huggingface import HuggingFaceEmbedding | |
| # Import GPU-tagged model functions | |
| from model import ( | |
| get_llm_for_rag as get_llm_for_rag_gpu, | |
| get_embedding_model as get_embedding_model_gpu, | |
| generate_with_medswin, | |
| initialize_medical_model, | |
| global_medical_models, | |
| global_medical_tokenizers | |
| ) | |
| from tqdm import tqdm | |
| from langdetect import detect, LangDetectException | |
| # MCP imports | |
| try: | |
| from mcp import ClientSession, StdioServerParameters | |
| from mcp.client.stdio import stdio_client | |
| import asyncio | |
| try: | |
| import nest_asyncio | |
| nest_asyncio.apply() # Allow nested event loops | |
| except ImportError: | |
| pass # nest_asyncio is optional | |
| MCP_AVAILABLE = True | |
| except ImportError: | |
| MCP_AVAILABLE = False | |
| # Fallback imports if MCP is not available | |
| from ddgs import DDGS | |
| import requests | |
| from bs4 import BeautifulSoup | |
| try: | |
| from TTS.api import TTS | |
| TTS_AVAILABLE = True | |
| except ImportError: | |
| TTS_AVAILABLE = False | |
| TTS = None | |
| import numpy as np | |
| import soundfile as sf | |
| import tempfile | |
| os.environ["TOKENIZERS_PARALLELISM"] = "false" | |
| logging.basicConfig(level=logging.INFO) | |
| logger = logging.getLogger(__name__) | |
| hf_logging.set_verbosity_error() | |
| # Model configurations | |
| MEDSWIN_MODELS = { | |
| "MedSwin SFT": "MedSwin/MedSwin-7B-SFT", | |
| "MedSwin KD": "MedSwin/MedSwin-7B-KD", | |
| "MedSwin TA": "MedSwin/MedSwin-Merged-TA-SFT-0.7" | |
| } | |
| DEFAULT_MEDICAL_MODEL = "MedSwin TA" | |
| EMBEDDING_MODEL = "abhinand/MedEmbed-large-v0.1" # Domain-tuned medical embedding model | |
| TTS_MODEL = "maya-research/maya1" | |
| HF_TOKEN = os.environ.get("HF_TOKEN") | |
| if not HF_TOKEN: | |
| raise ValueError("HF_TOKEN not found in environment variables") | |
| # Gemini MCP configuration | |
| GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY") | |
| GEMINI_MODEL = os.environ.get("GEMINI_MODEL", "gemini-2.5-flash") # Default for harder tasks | |
| GEMINI_MODEL_LITE = os.environ.get("GEMINI_MODEL_LITE", "gemini-2.5-flash-lite") # For parsing and simple tasks | |
| # Custom UI | |
| TITLE = "<h1><center>🩺 MedLLM Agent - Medical RAG & Web Search System</center></h1>" | |
| DESCRIPTION = """ | |
| <center> | |
| <p><strong>Advanced Medical AI Assistant</strong> powered by MedSwin models</p> | |
| <p>📄 <strong>Document RAG:</strong> Answer based on uploaded medical documents</p> | |
| <p>🌐 <strong>Web Search:</strong> Fetch knowledge from reliable online medical resources</p> | |
| <p>🌍 <strong>Multi-language:</strong> Automatic translation for non-English queries</p> | |
| <p>Upload PDF or text files to get started!</p> | |
| </center> | |
| """ | |
| CSS = """ | |
| .upload-section { | |
| max-width: 400px; | |
| margin: 0 auto; | |
| padding: 10px; | |
| border: 2px dashed #ccc; | |
| border-radius: 10px; | |
| } | |
| .upload-button { | |
| background: #34c759 !important; | |
| color: white !important; | |
| border-radius: 25px !important; | |
| } | |
| .chatbot-container { | |
| margin-top: 20px; | |
| } | |
| .status-output { | |
| margin-top: 10px; | |
| font-size: 14px; | |
| } | |
| .processing-info { | |
| margin-top: 5px; | |
| font-size: 12px; | |
| color: #666; | |
| } | |
| .info-container { | |
| margin-top: 10px; | |
| padding: 10px; | |
| border-radius: 5px; | |
| } | |
| .file-list { | |
| margin-top: 0; | |
| max-height: 200px; | |
| overflow-y: auto; | |
| padding: 5px; | |
| border: 1px solid #eee; | |
| border-radius: 5px; | |
| } | |
| .stats-box { | |
| margin-top: 10px; | |
| padding: 10px; | |
| border-radius: 5px; | |
| font-size: 12px; | |
| } | |
| .submit-btn { | |
| background: #1a73e8 !important; | |
| color: white !important; | |
| border-radius: 25px !important; | |
| margin-left: 10px; | |
| padding: 5px 10px; | |
| font-size: 16px; | |
| } | |
| .input-row { | |
| display: flex; | |
| align-items: center; | |
| } | |
| .recording-timer { | |
| font-size: 12px; | |
| color: #666; | |
| text-align: center; | |
| margin-top: 5px; | |
| } | |
| .feature-badge { | |
| display: inline-block; | |
| padding: 3px 8px; | |
| margin: 2px; | |
| border-radius: 12px; | |
| font-size: 11px; | |
| font-weight: bold; | |
| } | |
| .badge-rag { | |
| background: #e3f2fd; | |
| color: #1976d2; | |
| } | |
| .badge-web { | |
| background: #f3e5f5; | |
| color: #7b1fa2; | |
| } | |
| @media (min-width: 768px) { | |
| .main-container { | |
| display: flex; | |
| justify-content: space-between; | |
| gap: 20px; | |
| } | |
| .upload-section { | |
| flex: 1; | |
| max-width: 300px; | |
| } | |
| .chatbot-container { | |
| flex: 2; | |
| margin-top: 0; | |
| } | |
| } | |
| """ | |
| # Global model storage - models are stored in model.py | |
| # Import the global model storage from model.py | |
| global_file_info = {} | |
| global_tts_model = None | |
| # MCP client storage | |
| global_mcp_session = None | |
| global_mcp_stdio_ctx = None # Store stdio context to keep it alive | |
| global_mcp_lock = threading.Lock() # Lock for thread-safe session access | |
| # MCP server configuration via environment variables | |
| # Gemini MCP server: Python-based server (agent.py) | |
| # This works on Hugging Face Spaces without requiring npm/Node.js | |
| # Make sure GEMINI_API_KEY is set in environment variables | |
| # | |
| # Default configuration uses the bundled agent.py script | |
| # To override: | |
| # export MCP_SERVER_COMMAND="python" | |
| # export MCP_SERVER_ARGS="/path/to/agent.py" | |
| script_dir = os.path.dirname(os.path.abspath(__file__)) | |
| agent_path = os.path.join(script_dir, "agent.py") | |
| MCP_SERVER_COMMAND = os.environ.get("MCP_SERVER_COMMAND", "python") | |
| MCP_SERVER_ARGS = os.environ.get("MCP_SERVER_ARGS", agent_path).split() if os.environ.get("MCP_SERVER_ARGS") else [agent_path] | |
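| # Illustrative example (hypothetical values): with MCP_SERVER_COMMAND="python" and | |
| # MCP_SERVER_ARGS="/app/agent.py --verbose", the parsing above yields command "python" | |
| # and args ["/app/agent.py", "--verbose"]; with MCP_SERVER_ARGS unset, args default to [agent_path]. | |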
| async def get_mcp_session(): | |
| """Get or create MCP client session with proper context management""" | |
| global global_mcp_session, global_mcp_stdio_ctx | |
| if not MCP_AVAILABLE: | |
| return None | |
| # Check if session exists and is still valid | |
| if global_mcp_session is not None: | |
| try: | |
| # Test if session is still alive by listing tools | |
| await global_mcp_session.list_tools() | |
| return global_mcp_session | |
| except Exception as e: | |
| logger.debug(f"Existing MCP session invalid, recreating: {e}") | |
| # Clean up old session | |
| try: | |
| if global_mcp_session is not None: | |
| await global_mcp_session.__aexit__(None, None, None) | |
| except: | |
| pass | |
| try: | |
| if global_mcp_stdio_ctx is not None: | |
| await global_mcp_stdio_ctx.__aexit__(None, None, None) | |
| except: | |
| pass | |
| global_mcp_session = None | |
| global_mcp_stdio_ctx = None | |
| # Create new session using correct MCP SDK pattern | |
| try: | |
| # Prepare environment variables for MCP server | |
| mcp_env = os.environ.copy() | |
| if GEMINI_API_KEY: | |
| mcp_env["GEMINI_API_KEY"] = GEMINI_API_KEY | |
| else: | |
| logger.warning("GEMINI_API_KEY not set in environment. Gemini MCP features may not work.") | |
| # Add other Gemini MCP configuration if set | |
| if os.environ.get("GEMINI_MODEL"): | |
| mcp_env["GEMINI_MODEL"] = os.environ.get("GEMINI_MODEL") | |
| if os.environ.get("GEMINI_TIMEOUT"): | |
| mcp_env["GEMINI_TIMEOUT"] = os.environ.get("GEMINI_TIMEOUT") | |
| if os.environ.get("GEMINI_MAX_OUTPUT_TOKENS"): | |
| mcp_env["GEMINI_MAX_OUTPUT_TOKENS"] = os.environ.get("GEMINI_MAX_OUTPUT_TOKENS") | |
| if os.environ.get("GEMINI_TEMPERATURE"): | |
| mcp_env["GEMINI_TEMPERATURE"] = os.environ.get("GEMINI_TEMPERATURE") | |
| logger.info(f"Creating MCP client session with command: {MCP_SERVER_COMMAND} {MCP_SERVER_ARGS}") | |
| server_params = StdioServerParameters( | |
| command=MCP_SERVER_COMMAND, | |
| args=MCP_SERVER_ARGS, | |
| env=mcp_env | |
| ) | |
| # Correct MCP SDK usage: stdio_client is an async context manager | |
| # that yields (read, write) streams | |
| stdio_ctx = stdio_client(server_params) | |
| read, write = await stdio_ctx.__aenter__() | |
| # Create ClientSession from the streams | |
| # Entering the session context starts the message loop; initialize() must still be called for the handshake | |
| session = ClientSession(read, write) | |
| # Wait longer for the server process to fully start | |
| # The server needs time to: start Python, import modules, initialize Gemini client, start MCP server | |
| logger.info("⏳ Waiting for MCP server process to start...") | |
| await asyncio.sleep(3.0) # Increased wait for server process startup | |
| try: | |
| # Enter the session context, then perform the MCP initialization handshake | |
| logger.info("🔄 Initializing MCP session...") | |
| await session.__aenter__() | |
| await session.initialize()  # explicit initialize request/response | |
| logger.info("✅ MCP session initialized, verifying tools...") | |
| except Exception as e: | |
| logger.warning(f"MCP session initialization had an issue (may be expected): {e}") | |
| # Continue anyway - the session might still work | |
| # Wait longer for the server to be fully ready after initialization | |
| # The server needs time to process the initialization and be ready for requests | |
| await asyncio.sleep(2.0) # Wait after initialization | |
| # Verify the session works by listing tools with retries | |
| # This confirms the server is ready to handle requests | |
| max_init_retries = 15 | |
| tools_listed = False | |
| tools = None | |
| last_error = None | |
| for init_attempt in range(max_init_retries): | |
| try: | |
| tools = await session.list_tools() | |
| if tools and hasattr(tools, 'tools') and len(tools.tools) > 0: | |
| logger.info(f"✅ MCP server ready with {len(tools.tools)} tools: {[t.name for t in tools.tools]}") | |
| tools_listed = True | |
| break | |
| except Exception as e: | |
| last_error = e | |
| error_str = str(e).lower() | |
| error_msg = str(e) | |
| # Log the actual error for debugging | |
| if init_attempt == 0: | |
| logger.debug(f"First list_tools attempt failed: {error_msg}") | |
| # Ignore initialization-related errors during the handshake phase | |
| if "initialization" in error_str or "before initialization" in error_str or "not initialized" in error_str: | |
| if init_attempt < max_init_retries - 1: | |
| wait_time = 0.5 * (init_attempt + 1) # Progressive wait: 0.5s, 1s, 1.5s... | |
| logger.debug(f"Server still initializing (attempt {init_attempt + 1}/{max_init_retries}), waiting {wait_time}s...") | |
| await asyncio.sleep(wait_time) | |
| continue | |
| elif "invalid request" in error_str or "invalid request parameters" in error_str: | |
| # This might be a timing issue - wait and retry | |
| if init_attempt < max_init_retries - 1: | |
| wait_time = 0.8 * (init_attempt + 1) # Longer wait for invalid request errors | |
| logger.debug(f"Invalid request error (attempt {init_attempt + 1}/{max_init_retries}), waiting {wait_time}s...") | |
| await asyncio.sleep(wait_time) | |
| continue | |
| elif init_attempt < max_init_retries - 1: | |
| wait_time = 0.5 * (init_attempt + 1) | |
| logger.debug(f"Tool listing attempt {init_attempt + 1}/{max_init_retries} failed: {error_msg}, waiting {wait_time}s...") | |
| await asyncio.sleep(wait_time) | |
| else: | |
| logger.error(f"❌ Could not list tools after {max_init_retries} attempts. Last error: {error_msg}") | |
| # Don't continue - if we can't list tools, the session is not usable | |
| try: | |
| await session.__aexit__(None, None, None) | |
| except: | |
| pass | |
| try: | |
| await stdio_ctx.__aexit__(None, None, None) | |
| except: | |
| pass | |
| return None | |
| if not tools_listed: | |
| error_msg = str(last_error) if last_error else "Unknown error" | |
| logger.error(f"MCP server failed to initialize - tools could not be listed. Last error: {error_msg}") | |
| try: | |
| await session.__aexit__(None, None, None) | |
| except: | |
| pass | |
| try: | |
| await stdio_ctx.__aexit__(None, None, None) | |
| except: | |
| pass | |
| return None | |
| # Store both the session and stdio context to keep them alive | |
| global_mcp_session = session | |
| global_mcp_stdio_ctx = stdio_ctx | |
| logger.info("MCP client session created successfully") | |
| return session | |
| except Exception as e: | |
| logger.error(f"Failed to create MCP client session: {e}") | |
| import traceback | |
| logger.debug(traceback.format_exc()) | |
| global_mcp_session = None | |
| global_mcp_stdio_ctx = None | |
| return None | |
| async def call_agent(user_prompt: str, system_prompt: str = None, files: list = None, model: str = None, temperature: float = 0.2) -> str: | |
| """Call Gemini MCP generate_content tool""" | |
| if not MCP_AVAILABLE: | |
| logger.warning("MCP not available for Gemini call") | |
| return "" | |
| try: | |
| session = await get_mcp_session() | |
| if session is None: | |
| logger.warning("Failed to get MCP session for Gemini call") | |
| return "" | |
| # Retry listing tools if it fails the first time | |
| # Use more retries and longer waits since MCP server might need time | |
| max_retries = 5 | |
| tools = None | |
| for attempt in range(max_retries): | |
| try: | |
| tools = await session.list_tools() | |
| if tools and hasattr(tools, 'tools') and len(tools.tools) > 0: | |
| break | |
| else: | |
| raise ValueError("Empty tools list") | |
| except Exception as e: | |
| if attempt < max_retries - 1: | |
| wait_time = 1.0 * (attempt + 1) # Progressive wait | |
| logger.debug(f"Failed to list tools (attempt {attempt + 1}/{max_retries}), waiting {wait_time}s...") | |
| await asyncio.sleep(wait_time) | |
| else: | |
| logger.error(f"❌ Failed to list MCP tools after {max_retries} attempts: {e}") | |
| return "" | |
| if not tools or not hasattr(tools, 'tools'): | |
| logger.error("Invalid tools response from MCP server") | |
| return "" | |
| # Find generate_content tool | |
| generate_tool = None | |
| for tool in tools.tools: | |
| if tool.name == "generate_content" or "generate_content" in tool.name.lower(): | |
| generate_tool = tool | |
| logger.info(f"Found Gemini MCP tool: {tool.name}") | |
| break | |
| if not generate_tool: | |
| logger.warning(f"Gemini MCP generate_content tool not found. Available tools: {[t.name for t in tools.tools]}") | |
| return "" | |
| # Prepare arguments | |
| arguments = { | |
| "user_prompt": user_prompt | |
| } | |
| if system_prompt: | |
| arguments["system_prompt"] = system_prompt | |
| if files: | |
| arguments["files"] = files | |
| if model: | |
| arguments["model"] = model | |
| if temperature is not None: | |
| arguments["temperature"] = temperature | |
| logger.info(f"🔧 [MCP] Calling Gemini MCP tool '{generate_tool.name}' for: {user_prompt[:100]}...") | |
| logger.info(f"📋 [MCP] Arguments: model={model}, temperature={temperature}, files={len(files) if files else 0}") | |
| result = await session.call_tool(generate_tool.name, arguments=arguments) | |
| # Parse result | |
| if hasattr(result, 'content') and result.content: | |
| for item in result.content: | |
| if hasattr(item, 'text'): | |
| response_text = item.text.strip() | |
| logger.info(f"✅ [MCP] Gemini MCP returned response ({len(response_text)} chars)") | |
| return response_text | |
| logger.warning("⚠️ [MCP] Gemini MCP returned empty or invalid result") | |
| return "" | |
| except Exception as e: | |
| logger.error(f"Gemini MCP call error: {e}") | |
| import traceback | |
| logger.debug(traceback.format_exc()) | |
| return "" | |
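| # Usage sketch (illustrative only): from async code one could call, e.g., | |
| #   answer = await call_agent( | |
| #       user_prompt="Summarize key contraindications of aspirin.", | |
| #       system_prompt="You are a clinical pharmacist.", | |
| #       model=GEMINI_MODEL_LITE, temperature=0.2) | |
| # An empty string is returned when MCP is unavailable or the tool call fails. | |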
| # initialize_medical_model is now imported from model.py | |
| def initialize_tts_model(): | |
| """Initialize TTS model for text-to-speech""" | |
| global global_tts_model | |
| if not TTS_AVAILABLE: | |
| logger.warning("TTS library not installed. TTS features will be disabled.") | |
| return None | |
| if global_tts_model is None: | |
| try: | |
| logger.info("Initializing TTS model for voice generation...") | |
| global_tts_model = TTS(model_name=TTS_MODEL, progress_bar=False) | |
| logger.info("TTS model initialized successfully") | |
| except Exception as e: | |
| logger.warning(f"TTS model initialization failed: {e}") | |
| logger.warning("TTS features will be disabled. If pyworld dependency is missing, try: pip install TTS --no-deps && pip install coqui-tts") | |
| global_tts_model = None | |
| return global_tts_model | |
| async def transcribe_audio_gemini(audio_path: str) -> str: | |
| """Transcribe audio using Gemini MCP""" | |
| if not MCP_AVAILABLE: | |
| return "" | |
| try: | |
| # Ensure we have an absolute path | |
| audio_path_abs = os.path.abspath(audio_path) | |
| # Prepare file object for Gemini MCP using path (as per Gemini MCP documentation) | |
| files = [{ | |
| "path": audio_path_abs | |
| }] | |
| # Use exact prompts from Gemini MCP documentation | |
| system_prompt = "You are a professional transcription service. Provide accurate, well-formatted transcripts." | |
| user_prompt = "Please transcribe this audio file. Include speaker identification if multiple speakers are present, and format it with proper punctuation and paragraphs, remove mumble, ignore non-verbal noises." | |
| result = await call_agent( | |
| user_prompt=user_prompt, | |
| system_prompt=system_prompt, | |
| files=files, | |
| model=GEMINI_MODEL_LITE, # Use lite model for transcription | |
| temperature=0.2 | |
| ) | |
| return result.strip() | |
| except Exception as e: | |
| logger.error(f"Gemini transcription error: {e}") | |
| import traceback | |
| logger.debug(traceback.format_exc()) | |
| return "" | |
| def transcribe_audio(audio): | |
| """Transcribe audio to text using Gemini MCP""" | |
| if audio is None: | |
| return "" | |
| try: | |
| # Handle file path (Gradio Audio component returns file path) | |
| if isinstance(audio, str): | |
| audio_path = audio | |
| elif isinstance(audio, tuple): | |
| # Handle tuple format (sample_rate, audio_data) | |
| sample_rate, audio_data = audio | |
| # Save to temp file | |
| with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file: | |
| sf.write(tmp_file.name, audio_data, samplerate=sample_rate) | |
| audio_path = tmp_file.name | |
| else: | |
| audio_path = audio | |
| # Use Gemini MCP for transcription | |
| if MCP_AVAILABLE: | |
| try: | |
| loop = asyncio.get_event_loop() | |
| if loop.is_running(): | |
| try: | |
| import nest_asyncio | |
| nest_asyncio.apply()  # nest_asyncio exposes apply(), not run(); this lets the running loop be re-entered | |
| transcribed = loop.run_until_complete(transcribe_audio_gemini(audio_path)) | |
| if transcribed: | |
| logger.info(f"Transcribed via Gemini MCP: {transcribed[:50]}...") | |
| return transcribed | |
| except Exception as e: | |
| logger.error(f"Error in nested async transcription: {e}") | |
| else: | |
| transcribed = loop.run_until_complete(transcribe_audio_gemini(audio_path)) | |
| if transcribed: | |
| logger.info(f"Transcribed via Gemini MCP: {transcribed[:50]}...") | |
| return transcribed | |
| except Exception as e: | |
| logger.error(f"Gemini MCP transcription error: {e}") | |
| logger.warning("Gemini MCP transcription not available") | |
| return "" | |
| except Exception as e: | |
| logger.error(f"Transcription error: {e}") | |
| return "" | |
| async def generate_speech_mcp(text: str) -> str: | |
| """Generate speech using MCP TTS tool""" | |
| if not MCP_AVAILABLE: | |
| return None | |
| try: | |
| # Get MCP session | |
| session = await get_mcp_session() | |
| if session is None: | |
| return None | |
| # Find TTS tool | |
| tools = await session.list_tools() | |
| tts_tool = None | |
| for tool in tools.tools: | |
| if "tts" in tool.name.lower() or "speech" in tool.name.lower() or "synthesize" in tool.name.lower(): | |
| tts_tool = tool | |
| logger.info(f"Found MCP TTS tool: {tool.name}") | |
| break | |
| if tts_tool: | |
| result = await session.call_tool( | |
| tts_tool.name, | |
| arguments={"text": text, "language": "en"} | |
| ) | |
| # Parse result - MCP might return audio data or file path | |
| if hasattr(result, 'content') and result.content: | |
| for item in result.content: | |
| if hasattr(item, 'text'): | |
| # If it's a file path | |
| if os.path.exists(item.text): | |
| return item.text | |
| elif hasattr(item, 'data') and item.data: | |
| # If it's binary audio data, save it | |
| with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file: | |
| tmp_file.write(item.data) | |
| return tmp_file.name | |
| return None | |
| except Exception as e: | |
| logger.debug(f"MCP TTS error: {e}") | |
| return None | |
| def generate_speech(text: str): | |
| """Generate speech from text using TTS model (with MCP fallback)""" | |
| if not text or len(text.strip()) == 0: | |
| return None | |
| # Try MCP first if available | |
| if MCP_AVAILABLE: | |
| try: | |
| loop = asyncio.get_event_loop() | |
| if loop.is_running(): | |
| try: | |
| import nest_asyncio | |
| nest_asyncio.apply() | |
| audio_path = loop.run_until_complete(generate_speech_mcp(text)) | |
| if audio_path: | |
| logger.info("Generated speech via MCP") | |
| return audio_path | |
| except: | |
| pass | |
| else: | |
| audio_path = loop.run_until_complete(generate_speech_mcp(text)) | |
| if audio_path: | |
| logger.info("Generated speech via MCP") | |
| return audio_path | |
| except Exception as e: | |
| logger.debug(f"MCP TTS not available: {e}") | |
| # Fallback to local TTS model | |
| if not TTS_AVAILABLE: | |
| logger.error("TTS library not installed. Please install TTS to use voice generation.") | |
| return None | |
| global global_tts_model | |
| if global_tts_model is None: | |
| initialize_tts_model() | |
| if global_tts_model is None: | |
| logger.error("TTS model not available. Please check dependencies.") | |
| return None | |
| try: | |
| # Generate audio | |
| wav = global_tts_model.tts(text) | |
| # Save to temporary file | |
| with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file: | |
| sf.write(tmp_file.name, wav, samplerate=22050) | |
| return tmp_file.name | |
| except Exception as e: | |
| logger.error(f"TTS error: {e}") | |
| return None | |
| def format_prompt_manually(messages: list, tokenizer) -> str: | |
| """Manually format prompt for models without chat template""" | |
| # Combine system and user messages into a single instruction | |
| system_content = "" | |
| user_content = "" | |
| for msg in messages: | |
| role = msg.get("role", "user") | |
| content = msg.get("content", "") | |
| if role == "system": | |
| system_content = content | |
| elif role == "user": | |
| user_content = content | |
| elif role == "assistant": | |
| # Skip assistant messages in history for now (can be added if needed) | |
| pass | |
| # Format for MedAlpaca/LLaMA-based medical models | |
| # Common format: Instruction + Input -> Response | |
| if system_content: | |
| prompt = f"{system_content}\n\nQuestion: {user_content}\n\nAnswer:" | |
| else: | |
| prompt = f"Question: {user_content}\n\nAnswer:" | |
| return prompt | |
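| # Example (hypothetical input) of the resulting prompt string: | |
| #   format_prompt_manually([ | |
| #       {"role": "system", "content": "You are a medical specialist."}, | |
| #       {"role": "user", "content": "What causes iron-deficiency anemia?"}], tokenizer) | |
| #   -> "You are a medical specialist.\n\nQuestion: What causes iron-deficiency anemia?\n\nAnswer:" | |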
| def detect_language(text: str) -> str: | |
| """Detect language of input text""" | |
| try: | |
| lang = detect(text) | |
| return lang | |
| except LangDetectException: | |
| return "en" # Default to English if detection fails | |
| def format_url_as_domain(url: str) -> str: | |
| """Format URL as simple domain name (e.g., www.mayoclinic.org)""" | |
| if not url: | |
| return "" | |
| try: | |
| from urllib.parse import urlparse | |
| parsed = urlparse(url) | |
| domain = parsed.netloc or parsed.path | |
| # Keep the www. prefix for display | |
| if domain: | |
| return domain | |
| return url | |
| except Exception: | |
| # Fallback: try to extract domain manually | |
| if '://' in url: | |
| domain = url.split('://')[1].split('/')[0] | |
| return domain | |
| return url | |
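| # Examples (illustrative): | |
| #   format_url_as_domain("https://www.mayoclinic.org/diseases/anemia") -> "www.mayoclinic.org" | |
| #   format_url_as_domain("https://medlineplus.gov/anemia.html")        -> "medlineplus.gov" | |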
| async def translate_text_gemini(text: str, target_lang: str = "en", source_lang: str = None) -> str: | |
| """Translate text using Gemini MCP""" | |
| if source_lang: | |
| user_prompt = f"Translate the following {source_lang} text to {target_lang}. Only provide the translation, no explanations:\n\n{text}" | |
| else: | |
| user_prompt = f"Translate the following text to {target_lang}. Only provide the translation, no explanations:\n\n{text}" | |
| # Use concise system prompt | |
| system_prompt = "You are a professional translator. Translate accurately and concisely." | |
| result = await call_agent( | |
| user_prompt=user_prompt, | |
| system_prompt=system_prompt, | |
| model=GEMINI_MODEL_LITE, # Use lite model for translation | |
| temperature=0.2 | |
| ) | |
| return result.strip() | |
| def translate_text(text: str, target_lang: str = "en", source_lang: str = None) -> str: | |
| """Translate text using Gemini MCP""" | |
| if not MCP_AVAILABLE: | |
| logger.warning("Gemini MCP not available for translation") | |
| return text # Return original text if translation fails | |
| try: | |
| loop = asyncio.get_event_loop() | |
| if loop.is_running(): | |
| try: | |
| import nest_asyncio | |
| nest_asyncio.apply() | |
| translated = loop.run_until_complete(translate_text_gemini(text, target_lang, source_lang)) | |
| if translated: | |
| logger.info(f"Translated via Gemini MCP: {translated[:50]}...") | |
| return translated | |
| except Exception as e: | |
| logger.error(f"Error in nested async translation: {e}") | |
| else: | |
| translated = loop.run_until_complete(translate_text_gemini(text, target_lang, source_lang)) | |
| if translated: | |
| logger.info(f"Translated via Gemini MCP: {translated[:50]}...") | |
| return translated | |
| except Exception as e: | |
| logger.error(f"Gemini MCP translation error: {e}") | |
| # Return original text if translation fails | |
| return text | |
| async def search_web_mcp_tool(query: str, max_results: int = 5) -> list: | |
| """Search web using MCP web search tool (e.g., DuckDuckGo MCP server)""" | |
| if not MCP_AVAILABLE: | |
| return [] | |
| try: | |
| session = await get_mcp_session() | |
| if session is None: | |
| return [] | |
| # Retry listing tools if it fails the first time | |
| max_retries = 3 | |
| tools = None | |
| for attempt in range(max_retries): | |
| try: | |
| tools = await session.list_tools() | |
| break | |
| except Exception as e: | |
| if attempt < max_retries - 1: | |
| await asyncio.sleep(0.5 * (attempt + 1)) | |
| else: | |
| logger.error(f"Failed to list MCP tools after {max_retries} attempts: {e}") | |
| return [] | |
| if not tools or not hasattr(tools, 'tools'): | |
| return [] | |
| # Look for web search tools (DuckDuckGo, search, etc.) | |
| search_tool = None | |
| for tool in tools.tools: | |
| tool_name_lower = tool.name.lower() | |
| if any(keyword in tool_name_lower for keyword in ["search", "duckduckgo", "ddg", "web"]): | |
| search_tool = tool | |
| logger.info(f"Found web search MCP tool: {tool.name}") | |
| break | |
| if search_tool: | |
| try: | |
| logger.info(f"🔍 [MCP] Using web search MCP tool '{search_tool.name}' for: {query[:100]}...") | |
| # Call the search tool | |
| result = await session.call_tool( | |
| search_tool.name, | |
| arguments={"query": query, "max_results": max_results} | |
| ) | |
| # Parse result | |
| web_content = [] | |
| if hasattr(result, 'content') and result.content: | |
| for item in result.content: | |
| if hasattr(item, 'text'): | |
| try: | |
| data = json.loads(item.text) | |
| if isinstance(data, list): | |
| for entry in data[:max_results]: | |
| web_content.append({ | |
| 'title': entry.get('title', ''), | |
| 'url': entry.get('url', entry.get('href', '')), | |
| 'content': entry.get('body', entry.get('snippet', entry.get('content', ''))) | |
| }) | |
| elif isinstance(data, dict): | |
| if 'results' in data: | |
| for entry in data['results'][:max_results]: | |
| web_content.append({ | |
| 'title': entry.get('title', ''), | |
| 'url': entry.get('url', entry.get('href', '')), | |
| 'content': entry.get('body', entry.get('snippet', entry.get('content', ''))) | |
| }) | |
| else: | |
| web_content.append({ | |
| 'title': data.get('title', ''), | |
| 'url': data.get('url', data.get('href', '')), | |
| 'content': data.get('body', data.get('snippet', data.get('content', ''))) | |
| }) | |
| except json.JSONDecodeError: | |
| # If not JSON, treat as plain text | |
| web_content.append({ | |
| 'title': '', | |
| 'url': '', | |
| 'content': item.text[:1000] | |
| }) | |
| if web_content: | |
| logger.info(f"✅ [MCP] Web search MCP tool returned {len(web_content)} results") | |
| return web_content | |
| except Exception as e: | |
| logger.error(f"Error calling web search MCP tool: {e}") | |
| return [] | |
| except Exception as e: | |
| logger.error(f"Web search MCP tool error: {e}") | |
| return [] | |
| async def search_web_mcp(query: str, max_results: int = 5) -> list: | |
| """Search web using MCP tools - tries web search MCP tool first, then falls back to direct search""" | |
| # First try to use a dedicated web search MCP tool (like DuckDuckGo MCP server) | |
| results = await search_web_mcp_tool(query, max_results) | |
| if results: | |
| logger.info(f"✅ Web search via MCP tool: found {len(results)} results") | |
| return results | |
| # If no web search MCP tool available, use direct search (ddgs) | |
| # Note: Gemini MCP doesn't have web search capability, so we use direct API | |
| # The results will then be summarized using Gemini MCP | |
| logger.info("ℹ️ [Direct API] No web search MCP tool found, using direct DuckDuckGo search (results will be summarized with Gemini MCP)") | |
| return search_web_fallback(query, max_results) | |
| def search_web_fallback(query: str, max_results: int = 5) -> list: | |
| """Fallback web search using DuckDuckGo directly (when MCP is not available)""" | |
| logger.info(f"🔍 [Direct API] Performing web search using DuckDuckGo API for: {query[:100]}...") | |
| # Always import here to ensure availability | |
| try: | |
| from ddgs import DDGS | |
| import requests | |
| from bs4 import BeautifulSoup | |
| except ImportError: | |
| logger.error("Fallback dependencies (ddgs, requests, beautifulsoup4) not available") | |
| return [] | |
| try: | |
| with DDGS() as ddgs: | |
| results = list(ddgs.text(query, max_results=max_results)) | |
| web_content = [] | |
| for result in results: | |
| try: | |
| url = result.get('href', '') | |
| title = result.get('title', '') | |
| snippet = result.get('body', '') | |
| # Try to fetch full content | |
| try: | |
| response = requests.get(url, timeout=5, headers={'User-Agent': 'Mozilla/5.0'}) | |
| if response.status_code == 200: | |
| soup = BeautifulSoup(response.content, 'html.parser') | |
| # Extract main content | |
| for script in soup(["script", "style"]): | |
| script.decompose() | |
| text = soup.get_text() | |
| # Clean and limit text | |
| lines = (line.strip() for line in text.splitlines()) | |
| chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) | |
| text = ' '.join(chunk for chunk in chunks if chunk) | |
| if len(text) > 1000: | |
| text = text[:1000] + "..." | |
| web_content.append({ | |
| 'title': title, | |
| 'url': url, | |
| 'content': snippet + "\n" + text[:500] if text else snippet | |
| }) | |
| else: | |
| web_content.append({ | |
| 'title': title, | |
| 'url': url, | |
| 'content': snippet | |
| }) | |
| except: | |
| web_content.append({ | |
| 'title': title, | |
| 'url': url, | |
| 'content': snippet | |
| }) | |
| except Exception as e: | |
| logger.error(f"Error processing search result: {e}") | |
| continue | |
| logger.info(f"✅ [Direct API] Web search completed: {len(web_content)} results") | |
| return web_content | |
| except Exception as e: | |
| logger.error(f"❌ [Direct API] Web search error: {e}") | |
| return [] | |
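| # Both the MCP and fallback search paths return a list of dicts with the same shape, e.g. | |
| # (illustrative): [{"title": "Anemia - Symptoms and causes", "url": "https://www.mayoclinic.org/...", | |
| #                   "content": "snippet plus up to ~500 chars of fetched page text"}] | |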
| def search_web(query: str, max_results: int = 5) -> list: | |
| """Search web using MCP tools (synchronous wrapper) - prioritizes MCP over direct ddgs""" | |
| # Always try MCP first if available | |
| if MCP_AVAILABLE: | |
| try: | |
| # Run async MCP search | |
| try: | |
| loop = asyncio.get_event_loop() | |
| except RuntimeError: | |
| loop = asyncio.new_event_loop() | |
| asyncio.set_event_loop(loop) | |
| if loop.is_running(): | |
| # If loop is already running, use nest_asyncio or create new thread | |
| try: | |
| import nest_asyncio | |
| nest_asyncio.apply() | |
| results = loop.run_until_complete(search_web_mcp(query, max_results)) | |
| if results: # Only return if we got results from MCP | |
| return results | |
| except (ImportError, AttributeError): | |
| # Fallback: run in thread | |
| import concurrent.futures | |
| with concurrent.futures.ThreadPoolExecutor() as executor: | |
| future = executor.submit(asyncio.run, search_web_mcp(query, max_results)) | |
| results = future.result(timeout=30) | |
| if results: # Only return if we got results from MCP | |
| return results | |
| else: | |
| results = loop.run_until_complete(search_web_mcp(query, max_results)) | |
| if results: # Only return if we got results from MCP | |
| return results | |
| except Exception as e: | |
| logger.error(f"Error running async MCP search: {e}") | |
| # Only use ddgs fallback if MCP is not available or returned no results | |
| logger.info("ℹ️ [Direct API] Falling back to direct DuckDuckGo search (MCP unavailable or returned no results)") | |
| return search_web_fallback(query, max_results) | |
| async def summarize_web_content_gemini(content_list: list, query: str) -> str: | |
| """Summarize web search results using Gemini MCP""" | |
| logger.info(f"📝 [MCP] Summarizing {len(content_list)} web search results using Gemini MCP...") | |
| combined_content = "\n\n".join([f"Source: {item['title']}\n{item['content']}" for item in content_list[:3]]) | |
| user_prompt = f"""Summarize the following web search results related to the query: "{query}" | |
| Extract key medical information, facts, and insights. Be concise and focus on reliable information. | |
| Search Results: | |
| {combined_content} | |
| Summary:""" | |
| # Use concise system prompt | |
| system_prompt = "You are a medical information summarizer. Extract and summarize key medical facts accurately." | |
| result = await call_agent( | |
| user_prompt=user_prompt, | |
| system_prompt=system_prompt, | |
| model=GEMINI_MODEL, # Use full model for summarization | |
| temperature=0.5 | |
| ) | |
| if result: | |
| logger.info(f"✅ [MCP] Web content summarized successfully using Gemini MCP ({len(result)} chars)") | |
| else: | |
| logger.warning("⚠️ [MCP] Gemini MCP summarization returned empty result") | |
| return result.strip() | |
| def summarize_web_content(content_list: list, query: str) -> str: | |
| """Summarize web search results using Gemini MCP""" | |
| if not MCP_AVAILABLE: | |
| logger.warning("Gemini MCP not available for summarization") | |
| # Fallback: return first result's content | |
| if content_list: | |
| return content_list[0].get('content', '')[:500] | |
| return "" | |
| try: | |
| loop = asyncio.get_event_loop() | |
| if loop.is_running(): | |
| try: | |
| import nest_asyncio | |
| nest_asyncio.apply() | |
| summary = loop.run_until_complete(summarize_web_content_gemini(content_list, query)) | |
| if summary: | |
| return summary | |
| except Exception as e: | |
| logger.error(f"Error in nested async summarization: {e}") | |
| else: | |
| summary = loop.run_until_complete(summarize_web_content_gemini(content_list, query)) | |
| if summary: | |
| return summary | |
| except Exception as e: | |
| logger.error(f"Gemini MCP summarization error: {e}") | |
| # Fallback: return first result's content | |
| if content_list: | |
| return content_list[0].get('content', '')[:500] | |
| return "" | |
| # get_llm_for_rag is now imported from model.py as get_llm_for_rag_gpu | |
| async def autonomous_reasoning_gemini(query: str) -> dict: | |
| """Autonomous reasoning using Gemini MCP""" | |
| logger.info(f"🧠 [MCP] Analyzing query with Gemini MCP: {query[:100]}...") | |
| reasoning_prompt = f"""Analyze this medical query and provide structured reasoning: | |
| Query: "{query}" | |
| Analyze: | |
| 1. Query Type: (diagnosis, treatment, drug_info, symptom_analysis, research, general_info) | |
| 2. Complexity: (simple, moderate, complex, multi_faceted) | |
| 3. Information Needs: What specific information is required? | |
| 4. Requires RAG: (yes/no) - Does this need document context? | |
| 5. Requires Web Search: (yes/no) - Does this need current/updated information? | |
| 6. Sub-questions: Break down into key sub-questions if complex | |
| Respond in JSON format: | |
| {{ | |
| "query_type": "...", | |
| "complexity": "...", | |
| "information_needs": ["..."], | |
| "requires_rag": true/false, | |
| "requires_web_search": true/false, | |
| "sub_questions": ["..."] | |
| }}""" | |
| # Use concise system prompt | |
| system_prompt = "You are a medical reasoning system. Analyze queries systematically and provide structured JSON responses." | |
| response = await call_agent( | |
| user_prompt=reasoning_prompt, | |
| system_prompt=system_prompt, | |
| model=GEMINI_MODEL, # Use full model for reasoning | |
| temperature=0.3 | |
| ) | |
| # Parse JSON response (with fallback) | |
| try: | |
| # Extract JSON from response | |
| json_start = response.find('{') | |
| json_end = response.rfind('}') + 1 | |
| if json_start >= 0 and json_end > json_start: | |
| reasoning = json.loads(response[json_start:json_end]) | |
| else: | |
| raise ValueError("No JSON found") | |
| except: | |
| # Fallback reasoning | |
| reasoning = { | |
| "query_type": "general_info", | |
| "complexity": "moderate", | |
| "information_needs": ["medical information"], | |
| "requires_rag": True, | |
| "requires_web_search": False, | |
| "sub_questions": [query] | |
| } | |
| logger.info(f"Reasoning analysis: {reasoning}") | |
| return reasoning | |
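| # Example of a parsed reasoning dict for a hypothetical treatment query: | |
| #   {"query_type": "treatment", "complexity": "moderate", | |
| #    "information_needs": ["current first-line regimens"], | |
| #    "requires_rag": True, "requires_web_search": True, | |
| #    "sub_questions": ["What are the current first-line regimens?"]} | |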
| def autonomous_reasoning(query: str, history: list) -> dict: | |
| """ | |
| Autonomous reasoning: Analyze query complexity, intent, and information needs. | |
| Returns reasoning analysis with query type, complexity, and required information sources. | |
| Uses Gemini MCP for reasoning. | |
| """ | |
| if not MCP_AVAILABLE: | |
| logger.warning("⚠️ Gemini MCP not available for reasoning, using fallback") | |
| # Fallback reasoning | |
| return { | |
| "query_type": "general_info", | |
| "complexity": "moderate", | |
| "information_needs": ["medical information"], | |
| "requires_rag": True, | |
| "requires_web_search": False, | |
| "sub_questions": [query] | |
| } | |
| try: | |
| logger.info("🤔 [MCP] Using Gemini MCP for autonomous reasoning...") | |
| loop = asyncio.get_event_loop() | |
| if loop.is_running(): | |
| try: | |
| import nest_asyncio | |
| nest_asyncio.apply() | |
| reasoning = loop.run_until_complete(autonomous_reasoning_gemini(query)) | |
| if reasoning and reasoning.get("query_type") != "general_info": # Check if we got real reasoning | |
| logger.info(f"✅ [MCP] Gemini MCP reasoning successful: {reasoning.get('query_type')}, complexity: {reasoning.get('complexity')}") | |
| return reasoning | |
| else: | |
| logger.warning("⚠️ [MCP] Gemini MCP returned fallback reasoning, using it anyway") | |
| return reasoning | |
| except Exception as e: | |
| logger.error(f"❌ Error in nested async reasoning: {e}") | |
| import traceback | |
| logger.debug(traceback.format_exc()) | |
| else: | |
| reasoning = loop.run_until_complete(autonomous_reasoning_gemini(query)) | |
| if reasoning and reasoning.get("query_type") != "general_info": | |
| logger.info(f"✅ [MCP] Gemini MCP reasoning successful: {reasoning.get('query_type')}, complexity: {reasoning.get('complexity')}") | |
| return reasoning | |
| else: | |
| logger.warning("⚠️ [MCP] Gemini MCP returned fallback reasoning, using it anyway") | |
| return reasoning | |
| except Exception as e: | |
| logger.error(f"❌ Gemini MCP reasoning error: {e}") | |
| import traceback | |
| logger.debug(traceback.format_exc()) | |
| # Fallback reasoning only if all attempts failed | |
| logger.warning("⚠️ Falling back to default reasoning") | |
| return { | |
| "query_type": "general_info", | |
| "complexity": "moderate", | |
| "information_needs": ["medical information"], | |
| "requires_rag": True, | |
| "requires_web_search": False, | |
| "sub_questions": [query] | |
| } | |
| def create_execution_plan(reasoning: dict, query: str, has_rag_index: bool) -> dict: | |
| """ | |
| Planning: Create multi-step execution plan based on reasoning analysis. | |
| Returns execution plan with steps and strategy. | |
| """ | |
| plan = { | |
| "steps": [], | |
| "strategy": "sequential", | |
| "iterations": 1 | |
| } | |
| # Determine execution strategy | |
| if reasoning["complexity"] in ["complex", "multi_faceted"]: | |
| plan["strategy"] = "iterative" | |
| plan["iterations"] = 2 | |
| # Step 1: Language detection and translation | |
| plan["steps"].append({ | |
| "step": 1, | |
| "action": "detect_language", | |
| "description": "Detect query language and translate if needed" | |
| }) | |
| # Step 2: RAG retrieval (if needed and available) | |
| if reasoning.get("requires_rag", True) and has_rag_index: | |
| plan["steps"].append({ | |
| "step": 2, | |
| "action": "rag_retrieval", | |
| "description": "Retrieve relevant document context", | |
| "parameters": {"top_k": 15, "merge_threshold": 0.5} | |
| }) | |
| # Step 3: Web search (if needed) | |
| if reasoning.get("requires_web_search", False): | |
| plan["steps"].append({ | |
| "step": 3, | |
| "action": "web_search", | |
| "description": "Search web for current/updated information", | |
| "parameters": {"max_results": 5} | |
| }) | |
| # Step 4: Sub-question processing (if complex) | |
| if reasoning.get("sub_questions") and len(reasoning["sub_questions"]) > 1: | |
| plan["steps"].append({ | |
| "step": 4, | |
| "action": "multi_step_reasoning", | |
| "description": "Process sub-questions iteratively", | |
| "sub_questions": reasoning["sub_questions"] | |
| }) | |
| # Step 5: Synthesis and answer generation | |
| plan["steps"].append({ | |
| "step": len(plan["steps"]) + 1, | |
| "action": "synthesize_answer", | |
| "description": "Generate comprehensive answer from all sources" | |
| }) | |
| # Step 6: Self-reflection (for complex queries) | |
| if reasoning["complexity"] in ["complex", "multi_faceted"]: | |
| plan["steps"].append({ | |
| "step": len(plan["steps"]) + 1, | |
| "action": "self_reflection", | |
| "description": "Evaluate answer quality and completeness" | |
| }) | |
| logger.info(f"Execution plan created: {len(plan['steps'])} steps") | |
| return plan | |
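| # Illustrative plan for a complex query with a RAG index, web search, and multiple sub-questions: | |
| #   {"strategy": "iterative", "iterations": 2, "steps": [ | |
| #     {"step": 1, "action": "detect_language", ...}, {"step": 2, "action": "rag_retrieval", ...}, | |
| #     {"step": 3, "action": "web_search", ...}, {"step": 4, "action": "multi_step_reasoning", ...}, | |
| #     {"step": 5, "action": "synthesize_answer", ...}, {"step": 6, "action": "self_reflection", ...}]} | |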
| def autonomous_execution_strategy(reasoning: dict, plan: dict, use_rag: bool, use_web_search: bool, has_rag_index: bool) -> dict: | |
| """ | |
| Autonomous execution: Make decisions on information gathering strategy. | |
| Only suggests web search override, but respects user's RAG disable setting. | |
| """ | |
| strategy = { | |
| "use_rag": use_rag, # Respect user's RAG setting | |
| "use_web_search": use_web_search, | |
| "reasoning_override": False, | |
| "rationale": "" | |
| } | |
| # Only suggest web search override (RAG requires documents, so we respect user's choice) | |
| if reasoning.get("requires_web_search", False) and not use_web_search: | |
| strategy["use_web_search"] = True | |
| strategy["reasoning_override"] = True | |
| strategy["rationale"] += "Reasoning suggests web search for current information. " | |
| # Note: We don't override RAG setting because: | |
| # 1. User may have explicitly disabled it | |
| # 2. RAG requires documents to be uploaded | |
| # 3. We should respect user's explicit choice | |
| if strategy["reasoning_override"]: | |
| logger.info(f"Autonomous override: {strategy['rationale']}") | |
| return strategy | |
| async def self_reflection_gemini(answer: str, query: str) -> dict: | |
| """Self-reflection using Gemini MCP""" | |
| reflection_prompt = f"""Evaluate this medical answer for quality and completeness: | |
| Query: "{query}" | |
| Answer: "{answer[:1000]}" | |
| Evaluate: | |
| 1. Completeness: Does it address all aspects of the query? | |
| 2. Accuracy: Is the medical information accurate? | |
| 3. Clarity: Is it clear and well-structured? | |
| 4. Sources: Are sources cited appropriately? | |
| 5. Missing Information: What important information might be missing? | |
| Respond in JSON: | |
| {{ | |
| "completeness_score": 0-10, | |
| "accuracy_score": 0-10, | |
| "clarity_score": 0-10, | |
| "overall_score": 0-10, | |
| "missing_aspects": ["..."], | |
| "improvement_suggestions": ["..."] | |
| }}""" | |
| # Use concise system prompt | |
| system_prompt = "You are a medical answer quality evaluator. Provide honest, constructive feedback." | |
| response = await call_agent( | |
| user_prompt=reflection_prompt, | |
| system_prompt=system_prompt, | |
| model=GEMINI_MODEL, # Use full model for reflection | |
| temperature=0.3 | |
| ) | |
| try: | |
| json_start = response.find('{') | |
| json_end = response.rfind('}') + 1 | |
| if json_start >= 0 and json_end > json_start: | |
| reflection = json.loads(response[json_start:json_end]) | |
| else: | |
| reflection = {"overall_score": 7, "improvement_suggestions": []} | |
| except: | |
| reflection = {"overall_score": 7, "improvement_suggestions": []} | |
| logger.info(f"Self-reflection score: {reflection.get('overall_score', 'N/A')}") | |
| return reflection | |
| def self_reflection(answer: str, query: str, reasoning: dict) -> dict: | |
| """ | |
| Self-reflection: Evaluate answer quality and completeness. | |
| Returns reflection with quality score and improvement suggestions. | |
| """ | |
| if not MCP_AVAILABLE: | |
| logger.warning("Gemini MCP not available for reflection, using fallback") | |
| return {"overall_score": 7, "improvement_suggestions": []} | |
| try: | |
| loop = asyncio.get_event_loop() | |
| if loop.is_running(): | |
| try: | |
| import nest_asyncio | |
| nest_asyncio.apply() | |
| return loop.run_until_complete(self_reflection_gemini(answer, query)) | |
| except Exception as e: | |
| logger.error(f"Error in nested async reflection: {e}") | |
| else: | |
| return loop.run_until_complete(self_reflection_gemini(answer, query)) | |
| except Exception as e: | |
| logger.error(f"Gemini MCP reflection error: {e}") | |
| return {"overall_score": 7, "improvement_suggestions": []} | |
| async def parse_document_gemini(file_path: str, file_extension: str) -> str: | |
| """Parse document using Gemini MCP""" | |
| if not MCP_AVAILABLE: | |
| return "" | |
| try: | |
| # Read file and encode to base64 | |
| with open(file_path, 'rb') as f: | |
| file_content = base64.b64encode(f.read()).decode('utf-8') | |
| # Determine MIME type from file extension | |
| mime_type_map = { | |
| '.pdf': 'application/pdf', | |
| '.doc': 'application/msword', | |
| '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', | |
| '.txt': 'text/plain', | |
| '.md': 'text/markdown', | |
| '.json': 'application/json', | |
| '.xml': 'application/xml', | |
| '.csv': 'text/csv' | |
| } | |
| mime_type = mime_type_map.get(file_extension, 'application/octet-stream') | |
| # Prepare file object for Gemini MCP (use content for base64) | |
| files = [{ | |
| "content": file_content, | |
| "type": mime_type | |
| }] | |
| # Use concise system prompt | |
| system_prompt = "Extract all text content from the document accurately." | |
| user_prompt = "Extract all text content from this document. Return only the extracted text, preserving structure and formatting where possible." | |
| result = await call_agent( | |
| user_prompt=user_prompt, | |
| system_prompt=system_prompt, | |
| files=files, | |
| model=GEMINI_MODEL_LITE, # Use lite model for parsing | |
| temperature=0.2 | |
| ) | |
| return result.strip() | |
| except Exception as e: | |
| logger.error(f"Gemini document parsing error: {e}") | |
| import traceback | |
| logger.debug(traceback.format_exc()) | |
| return "" | |
| def extract_text_from_document(file): | |
| """Extract text from document using Gemini MCP""" | |
| file_name = file.name | |
| file_extension = os.path.splitext(file_name)[1].lower() | |
| # Handle text files directly | |
| if file_extension == '.txt': | |
| text = file.read().decode('utf-8') | |
| return text, len(text.split()), None | |
| # For PDF, Word, and other documents, use Gemini MCP | |
| # Save file to temporary location for processing | |
| try: | |
| with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as tmp_file: | |
| # Write file content to temp file | |
| file.seek(0) # Reset file pointer | |
| tmp_file.write(file.read()) | |
| tmp_file_path = tmp_file.name | |
| # Use Gemini MCP to parse document | |
| if MCP_AVAILABLE: | |
| try: | |
| loop = asyncio.get_event_loop() | |
| if loop.is_running(): | |
| try: | |
| import nest_asyncio | |
| nest_asyncio.apply() | |
| text = loop.run_until_complete(parse_document_gemini(tmp_file_path, file_extension)) | |
| except Exception as e: | |
| logger.error(f"Error in nested async document parsing: {e}") | |
| text = "" | |
| else: | |
| text = loop.run_until_complete(parse_document_gemini(tmp_file_path, file_extension)) | |
| # Clean up temp file | |
| try: | |
| os.unlink(tmp_file_path) | |
| except: | |
| pass | |
| if text: | |
| return text, len(text.split()), None | |
| else: | |
| return None, 0, ValueError(f"Failed to extract text from {file_extension} file using Gemini MCP") | |
| except Exception as e: | |
| logger.error(f"Gemini MCP document parsing error: {e}") | |
| # Clean up temp file | |
| try: | |
| os.unlink(tmp_file_path) | |
| except: | |
| pass | |
| return None, 0, ValueError(f"Error parsing {file_extension} file: {str(e)}") | |
| else: | |
| # Clean up temp file | |
| try: | |
| os.unlink(tmp_file_path) | |
| except: | |
| pass | |
| return None, 0, ValueError(f"Gemini MCP not available. Cannot parse {file_extension} files.") | |
| except Exception as e: | |
| logger.error(f"Error processing document: {e}") | |
| return None, 0, ValueError(f"Error processing {file_extension} file: {str(e)}") | |
| def create_or_update_index(files, request: gr.Request): | |
| global global_file_info | |
| if not files: | |
| return "Please provide files.", "" | |
| start_time = time.time() | |
| user_id = request.session_hash | |
| save_dir = f"./{user_id}_index" | |
| # Initialize LlamaIndex modules - use GPU functions for model inference only | |
| llm = get_llm_for_rag_gpu() | |
| embed_model = get_embedding_model_gpu() | |
| Settings.llm = llm | |
| Settings.embed_model = embed_model | |
| file_stats = [] | |
| new_documents = [] | |
| for file in tqdm(files, desc="Processing files"): | |
| file_basename = os.path.basename(file.name) | |
| text, word_count, error = extract_text_from_document(file) | |
| if error: | |
| logger.error(f"Error processing file {file_basename}: {str(error)}") | |
| file_stats.append({ | |
| "name": file_basename, | |
| "words": 0, | |
| "status": f"error: {str(error)}" | |
| }) | |
| continue | |
| doc = LlamaDocument( | |
| text=text, | |
| metadata={ | |
| "file_name": file_basename, | |
| "word_count": word_count, | |
| "source": "user_upload" | |
| } | |
| ) | |
| new_documents.append(doc) | |
| file_stats.append({ | |
| "name": file_basename, | |
| "words": word_count, | |
| "status": "processed" | |
| }) | |
| global_file_info[file_basename] = { | |
| "word_count": word_count, | |
| "processed_at": time.time() | |
| } | |
| node_parser = HierarchicalNodeParser.from_defaults( | |
| chunk_sizes=[2048, 512, 128], | |
| chunk_overlap=20 | |
| ) | |
| logger.info(f"Parsing {len(new_documents)} documents into hierarchical nodes") | |
| new_nodes = node_parser.get_nodes_from_documents(new_documents) | |
| new_leaf_nodes = get_leaf_nodes(new_nodes) | |
| new_root_nodes = get_root_nodes(new_nodes) | |
| logger.info(f"Generated {len(new_nodes)} total nodes ({len(new_root_nodes)} root, {len(new_leaf_nodes)} leaf)") | |
| if os.path.exists(save_dir): | |
| logger.info(f"Loading existing index from {save_dir}") | |
| storage_context = StorageContext.from_defaults(persist_dir=save_dir) | |
| index = load_index_from_storage(storage_context, settings=Settings) | |
| docstore = storage_context.docstore | |
| docstore.add_documents(new_nodes) | |
| for node in tqdm(new_leaf_nodes, desc="Adding leaf nodes to index"): | |
| index.insert_nodes([node]) | |
| total_docs = len(docstore.docs) | |
| logger.info(f"Updated index with {len(new_nodes)} new nodes from {len(new_documents)} files") | |
| else: | |
| logger.info("Creating new index") | |
| docstore = SimpleDocumentStore() | |
| storage_context = StorageContext.from_defaults(docstore=docstore) | |
| docstore.add_documents(new_nodes) | |
| index = VectorStoreIndex( | |
| new_leaf_nodes, | |
| storage_context=storage_context, | |
| settings=Settings | |
| ) | |
| total_docs = len(new_documents) | |
| logger.info(f"Created new index with {len(new_nodes)} nodes from {len(new_documents)} files") | |
| index.storage_context.persist(persist_dir=save_dir) | |
| # custom outputs after processing files | |
| file_list_html = "<div class='file-list'>" | |
| for stat in file_stats: | |
| status_color = "#4CAF50" if stat["status"] == "processed" else "#f44336" | |
| file_list_html += f"<div><span style='color:{status_color}'>●</span> {stat['name']} - {stat['words']} words</div>" | |
| file_list_html += "</div>" | |
| processing_time = time.time() - start_time | |
| stats_output = f"<div class='stats-box'>" | |
| stats_output += f"✓ Processed {len(files)} files in {processing_time:.2f} seconds<br>" | |
| stats_output += f"✓ Created {len(new_nodes)} nodes ({len(new_leaf_nodes)} leaf nodes)<br>" | |
| stats_output += f"✓ Total documents in index: {total_docs}<br>" | |
| stats_output += f"✓ Index saved to: {save_dir}<br>" | |
| stats_output += "</div>" | |
| output_container = f"<div class='info-container'>" | |
| output_container += file_list_html | |
| output_container += stats_output | |
| output_container += "</div>" | |
| return f"Successfully indexed {len(files)} files.", output_container | |
| def stream_chat( | |
| message: str, | |
| history: list, | |
| system_prompt: str, | |
| temperature: float, | |
| max_new_tokens: int, | |
| top_p: float, | |
| top_k: int, | |
| penalty: float, | |
| retriever_k: int, | |
| merge_threshold: float, | |
| use_rag: bool, | |
| medical_model: str, | |
| use_web_search: bool, | |
| request: gr.Request | |
| ): | |
| if not request: | |
| yield history + [{"role": "assistant", "content": "Session initialization failed. Please refresh the page."}] | |
| return | |
| user_id = request.session_hash | |
| index_dir = f"./{user_id}_index" | |
| has_rag_index = os.path.exists(index_dir) | |
| # ===== AUTONOMOUS REASONING ===== | |
| logger.info("🤔 Starting autonomous reasoning...") | |
| reasoning = autonomous_reasoning(message, history) | |
| # ===== PLANNING ===== | |
| logger.info("📋 Creating execution plan...") | |
| plan = create_execution_plan(reasoning, message, has_rag_index) | |
| # ===== AUTONOMOUS EXECUTION STRATEGY ===== | |
| logger.info("🎯 Determining execution strategy...") | |
| execution_strategy = autonomous_execution_strategy(reasoning, plan, use_rag, use_web_search, has_rag_index) | |
| # Use autonomous strategy decisions (respect user's RAG setting) | |
| final_use_rag = execution_strategy["use_rag"] and has_rag_index # Only use RAG if enabled AND documents exist | |
| final_use_web_search = execution_strategy["use_web_search"] | |
| # Show reasoning override message if applicable | |
| reasoning_note = "" | |
| if execution_strategy["reasoning_override"]: | |
| reasoning_note = f"\n\n💡 *Autonomous Reasoning: {execution_strategy['rationale']}*" | |
| # Detect language and translate if needed (Step 1 of plan) | |
| original_lang = detect_language(message) | |
| original_message = message | |
| needs_translation = original_lang != "en" | |
| if needs_translation: | |
| logger.info(f"Detected non-English language: {original_lang}, translating to English...") | |
| message = translate_text(message, target_lang="en", source_lang=original_lang) | |
| logger.info(f"Translated query: {message}") | |
| # Initialize medical model | |
| medical_model_obj, medical_tokenizer = initialize_medical_model(medical_model) | |
| # Adjust system prompt based on RAG setting and reasoning | |
| if final_use_rag: | |
| base_system_prompt = system_prompt if system_prompt else "As a medical specialist, provide clinical and concise answers based on the provided medical documents and context." | |
| else: | |
| base_system_prompt = "As a medical specialist, provide short and concise clinical answers. Be brief and avoid lengthy explanations. Focus on key medical facts only." | |
| # Add reasoning context to system prompt for complex queries | |
| if reasoning["complexity"] in ["complex", "multi_faceted"]: | |
| base_system_prompt += f"\n\nQuery Analysis: This is a {reasoning['complexity']} {reasoning['query_type']} query. Address all sub-questions: {', '.join(reasoning.get('sub_questions', [])[:3])}" | |
| # ===== EXECUTION: RAG Retrieval (Step 2) ===== | |
| rag_context = "" | |
| source_info = "" | |
| if final_use_rag and has_rag_index: | |
| # Use GPU function for embedding model | |
| embed_model = get_embedding_model_gpu() | |
| Settings.embed_model = embed_model | |
| storage_context = StorageContext.from_defaults(persist_dir=index_dir) | |
| index = load_index_from_storage(storage_context, settings=Settings) | |
| base_retriever = index.as_retriever(similarity_top_k=retriever_k) | |
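| # AutoMergingRetriever wraps the base retriever: when enough leaf nodes from the | |
| # same parent are retrieved (ratio above simple_ratio_thresh), they are replaced | |
| # by the parent node, giving the model larger contiguous context blocks. | |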
| auto_merging_retriever = AutoMergingRetriever( | |
| base_retriever, | |
| storage_context=storage_context, | |
| simple_ratio_thresh=merge_threshold, | |
| verbose=True | |
| ) | |
| logger.info(f"Query: {message}") | |
| retrieval_start = time.time() | |
| merged_nodes = auto_merging_retriever.retrieve(message) | |
| logger.info(f"Retrieved {len(merged_nodes)} merged nodes in {time.time() - retrieval_start:.2f}s") | |
| merged_file_sources = {} | |
| for node in merged_nodes: | |
| if hasattr(node.node, 'metadata') and 'file_name' in node.node.metadata: | |
| file_name = node.node.metadata['file_name'] | |
| if file_name not in merged_file_sources: | |
| merged_file_sources[file_name] = 0 | |
| merged_file_sources[file_name] += 1 | |
| logger.info(f"Merged retrieval file distribution: {merged_file_sources}") | |
| rag_context = "\n\n".join([n.node.text for n in merged_nodes]) | |
| if merged_file_sources: | |
| source_info = "\n\nRetrieved information from files: " + ", ".join(merged_file_sources.keys()) | |
| # ===== EXECUTION: Web Search (Step 3) ===== | |
| web_context = "" | |
| web_sources = [] | |
| web_urls = [] # Store URLs for citations | |
| if final_use_web_search: | |
| logger.info("🌐 Performing web search (will use Gemini MCP for summarization)...") | |
| web_results = search_web(message, max_results=5) | |
| if web_results: | |
| logger.info(f"📊 Found {len(web_results)} web search results, now summarizing with Gemini MCP...") | |
| web_summary = summarize_web_content(web_results, message) | |
| if web_summary and len(web_summary) > 50: # Check if we got a real summary | |
| logger.info(f"✅ [MCP] Gemini MCP summarization successful ({len(web_summary)} chars)") | |
| web_context = f"\n\nAdditional Web Sources (summarized with Gemini MCP):\n{web_summary}" | |
| else: | |
| logger.warning("⚠️ [MCP] Gemini MCP summarization failed or returned empty, using raw results") | |
| # Fallback: use first result's content | |
| web_context = f"\n\nAdditional Web Sources:\n{web_results[0].get('content', '')[:500]}" | |
| web_sources = [r['title'] for r in web_results[:3]] | |
| # Extract unique URLs for citations | |
| web_urls = [r.get('url', '') for r in web_results if r.get('url')] | |
| logger.info(f"✅ Web search completed: {len(web_results)} results, summarized with Gemini MCP") | |
| else: | |
| logger.warning("⚠️ Web search returned no results") | |
| # Build final context | |
| context_parts = [] | |
| if rag_context: | |
| context_parts.append(f"Document Context:\n{rag_context}") | |
| if web_context: | |
| context_parts.append(web_context) | |
| full_context = "\n\n".join(context_parts) if context_parts else "" | |
| # Build system prompt | |
| if final_use_rag or final_use_web_search: | |
| formatted_system_prompt = f"{base_system_prompt}\n\n{full_context}{source_info}" | |
| else: | |
| formatted_system_prompt = base_system_prompt | |
| # Prepare messages | |
| messages = [{"role": "system", "content": formatted_system_prompt}] | |
| for entry in history: | |
| messages.append(entry) | |
| messages.append({"role": "user", "content": message}) | |
| # Get EOS token and adjust stopping criteria | |
| eos_token_id = medical_tokenizer.eos_token_id | |
| if eos_token_id is None: | |
| eos_token_id = medical_tokenizer.pad_token_id | |
| # Enforce a floor on max tokens so medical answers are not truncated early | |
| max_new_tokens = int(max_new_tokens) if isinstance(max_new_tokens, (int, float)) else 2048 | |
| max_new_tokens = max(max_new_tokens, 1024)  # At least 1024 new tokens for medical answers | |
| # Check if tokenizer has chat template, otherwise format manually | |
| if hasattr(medical_tokenizer, 'chat_template') and medical_tokenizer.chat_template is not None: | |
| try: | |
| prompt = medical_tokenizer.apply_chat_template( | |
| messages, | |
| tokenize=False, | |
| add_generation_prompt=True | |
| ) | |
| except Exception as e: | |
| logger.warning(f"Chat template failed, using manual formatting: {e}") | |
| # Fallback to manual formatting | |
| prompt = format_prompt_manually(messages, medical_tokenizer) | |
| else: | |
| # Manual formatting for models without chat template | |
| prompt = format_prompt_manually(messages, medical_tokenizer) | |
| inputs = medical_tokenizer(prompt, return_tensors="pt").to(medical_model_obj.device) | |
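| # Record the prompt length so the stopping criteria can count only newly generated tokens | |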
| prompt_length = inputs['input_ids'].shape[1] | |
| stop_event = threading.Event() | |
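| # StopOnEvent checks this event so the GeneratorExit handler at the end of this | |
| # function can abort generation when the client disconnects mid-stream. | |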
| class StopOnEvent(StoppingCriteria): | |
| def __init__(self, stop_event): | |
| super().__init__() | |
| self.stop_event = stop_event | |
| def __call__(self, input_ids, scores, **kwargs): | |
| return self.stop_event.is_set() | |
| # Custom stopping criteria that doesn't stop on EOS too early | |
| class MedicalStoppingCriteria(StoppingCriteria): | |
| def __init__(self, eos_token_id, prompt_length, min_new_tokens=100): | |
| super().__init__() | |
| self.eos_token_id = eos_token_id | |
| self.prompt_length = prompt_length | |
| self.min_new_tokens = min_new_tokens | |
| def __call__(self, input_ids, scores, **kwargs): | |
| current_length = input_ids.shape[1] | |
| new_tokens = current_length - self.prompt_length | |
| last_token = input_ids[0, -1].item() | |
| # Don't stop on EOS if we haven't generated enough new tokens | |
| if new_tokens < self.min_new_tokens: | |
| return False | |
| # Allow EOS after minimum new tokens have been generated | |
| return last_token == self.eos_token_id | |
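| # Combine both criteria: the manual stop event plus the EOS rule with a minimum-new-token floor | |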
| stopping_criteria = StoppingCriteriaList([ | |
| StopOnEvent(stop_event), | |
| MedicalStoppingCriteria(eos_token_id, prompt_length, min_new_tokens=100) | |
| ]) | |
| streamer = TextIteratorStreamer( | |
| medical_tokenizer, | |
| skip_prompt=True, | |
| skip_special_tokens=True | |
| ) | |
| temperature = float(temperature) if isinstance(temperature, (int, float)) else 0.7 | |
| top_p = float(top_p) if isinstance(top_p, (int, float)) else 0.95 | |
| top_k = int(top_k) if isinstance(top_k, (int, float)) else 50 | |
| penalty = float(penalty) if isinstance(penalty, (int, float)) else 1.2 | |
| # Call GPU function for model inference only | |
| thread = threading.Thread( | |
| target=generate_with_medswin, | |
| kwargs={ | |
| "medical_model_obj": medical_model_obj, | |
| "medical_tokenizer": medical_tokenizer, | |
| "prompt": prompt, | |
| "max_new_tokens": max_new_tokens, | |
| "temperature": temperature, | |
| "top_p": top_p, | |
| "top_k": top_k, | |
| "penalty": penalty, | |
| "eos_token_id": eos_token_id, | |
| "pad_token_id": medical_tokenizer.pad_token_id or eos_token_id, | |
| "stop_event": stop_event, | |
| "streamer": streamer, | |
| "stopping_criteria": stopping_criteria | |
| } | |
| ) | |
| thread.start() | |
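| # Generation runs in the background thread; the streamer below yields partial | |
| # tokens that are appended to an empty assistant message in the chat history. | |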
| updated_history = history + [ | |
| {"role": "user", "content": original_message}, | |
| {"role": "assistant", "content": ""} | |
| ] | |
| yield updated_history | |
| partial_response = "" | |
| try: | |
| for new_text in streamer: | |
| partial_response += new_text | |
| updated_history[-1]["content"] = partial_response | |
| yield updated_history | |
| # ===== SELF-REFLECTION (Step 6) ===== | |
| if reasoning["complexity"] in ["complex", "multi_faceted"]: | |
| logger.info("🔍 Performing self-reflection on answer quality...") | |
| reflection = self_reflection(partial_response, message, reasoning) | |
| # Add reflection note if score is low or improvements suggested | |
| if reflection.get("overall_score", 10) < 7 or reflection.get("improvement_suggestions"): | |
| reflection_note = f"\n\n---\n**Self-Reflection** (Score: {reflection.get('overall_score', 'N/A')}/10)" | |
| if reflection.get("improvement_suggestions"): | |
| reflection_note += f"\n💡 Suggestions: {', '.join(reflection['improvement_suggestions'][:2])}" | |
| partial_response += reflection_note | |
| updated_history[-1]["content"] = partial_response | |
| # Add reasoning note if autonomous override occurred | |
| if reasoning_note: | |
| partial_response = reasoning_note + "\n\n" + partial_response | |
| updated_history[-1]["content"] = partial_response | |
| # Translate back if needed | |
| if needs_translation and partial_response: | |
| logger.info(f"Translating response back to {original_lang}...") | |
| translated_response = translate_text(partial_response, target_lang=original_lang, source_lang="en") | |
| partial_response = translated_response | |
| # Add citations if web sources were used | |
| citations_text = "" | |
| if web_urls: | |
| # Deduplicate URLs while preserving order (display domains are derived per link below) | |
| unique_urls = list(dict.fromkeys(web_urls)) # Preserve order, remove duplicates | |
| citation_links = [] | |
| for url in unique_urls[:5]: # Limit to 5 citations | |
| domain = format_url_as_domain(url) | |
| if domain: | |
| # Create markdown link: [domain](url) | |
| citation_links.append(f"[{domain}]({url})") | |
| if citation_links: | |
| citations_text = "\n\n**Sources:** " + ", ".join(citation_links) | |
| # Add speaker icon and citations to assistant message | |
| speaker_icon = ' 🔊' | |
| partial_response_with_speaker = partial_response + citations_text + speaker_icon | |
| updated_history[-1]["content"] = partial_response_with_speaker | |
| yield updated_history | |
| except GeneratorExit: | |
| stop_event.set() | |
| thread.join() | |
| raise | |
| def generate_speech_for_message(text: str): | |
| """Generate speech for a message and return audio file""" | |
| audio_path = generate_speech(text) | |
| if audio_path: | |
| return audio_path | |
| return None | |
| def create_demo(): | |
| with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo: | |
| gr.HTML(TITLE) | |
| gr.HTML(DESCRIPTION) | |
| with gr.Row(elem_classes="main-container"): | |
| with gr.Column(elem_classes="upload-section"): | |
| file_upload = gr.File( | |
| file_count="multiple", | |
| label="Drag and Drop Files Here", | |
| file_types=[".pdf", ".txt", ".doc", ".docx", ".md", ".json", ".xml", ".csv"], | |
| elem_id="file-upload" | |
| ) | |
| upload_button = gr.Button("Upload & Index", elem_classes="upload-button") | |
| status_output = gr.Textbox( | |
| label="Status", | |
| placeholder="Upload files to start...", | |
| interactive=False | |
| ) | |
| file_info_output = gr.HTML( | |
| label="File Information", | |
| elem_classes="processing-info" | |
| ) | |
| upload_button.click( | |
| fn=create_or_update_index, | |
| inputs=[file_upload], | |
| outputs=[status_output, file_info_output] | |
| ) | |
| with gr.Column(elem_classes="chatbot-container"): | |
| chatbot = gr.Chatbot( | |
| height=500, | |
| placeholder="Chat with MedSwin... Type your question below.", | |
| show_label=False, | |
| type="messages" | |
| ) | |
| with gr.Row(elem_classes="input-row"): | |
| message_input = gr.Textbox( | |
| placeholder="Type your medical question here...", | |
| show_label=False, | |
| container=False, | |
| lines=1, | |
| scale=10 | |
| ) | |
| mic_button = gr.Audio( | |
| sources=["microphone"], | |
| type="filepath", | |
| label="", | |
| show_label=False, | |
| container=False, | |
| scale=1 | |
| ) | |
| submit_button = gr.Button("➤", elem_classes="submit-btn", scale=1) | |
| # Timer display for recording (shown below input row) | |
| recording_timer = gr.Textbox( | |
| value="", | |
| label="", | |
| show_label=False, | |
| interactive=False, | |
| visible=False, | |
| container=False, | |
| elem_classes="recording-timer" | |
| ) | |
| # Handle microphone transcription (time is already imported at module level) | |
| recording_start_time = [None] | |
| def handle_recording_start(): | |
| """Called when recording starts""" | |
| recording_start_time[0] = time.time() | |
| return gr.update(visible=True, value="Recording... 0s") | |
| def handle_recording_stop(audio): | |
| """Called when recording stops""" | |
| recording_start_time[0] = None | |
| if audio is None: | |
| return gr.update(visible=False, value=""), "" | |
| transcribed = transcribe_audio(audio) | |
| return gr.update(visible=False, value=""), transcribed | |
| # Use JavaScript for timer updates (simpler than Gradio Timer) | |
| mic_button.start_recording( | |
| fn=handle_recording_start, | |
| outputs=[recording_timer] | |
| ) | |
| mic_button.stop_recording( | |
| fn=handle_recording_stop, | |
| inputs=[mic_button], | |
| outputs=[recording_timer, message_input] | |
| ) | |
| # TTS component for generating speech from messages | |
| with gr.Row() as tts_row: | |
| tts_text = gr.Textbox(visible=False) | |
| tts_audio = gr.Audio(label="Generated Speech", visible=False) | |
| # Function to generate speech when speaker icon is clicked | |
| def generate_speech_from_chat(history): | |
| """Extract the last assistant message and generate speech for it""" | |
| if not history: | |
| return gr.update(visible=False) | |
| last_msg = history[-1] | |
| if last_msg.get("role") == "assistant": | |
| text = last_msg.get("content", "").replace(" 🔊", "").strip() | |
| if text: | |
| audio_path = generate_speech(text) | |
| if audio_path: | |
| return gr.update(value=audio_path, visible=True) | |
| return gr.update(visible=False) | |
| # Add TTS button that appears when assistant responds | |
| tts_button = gr.Button("🔊 Play Response", visible=False, size="sm") | |
| # Update TTS button visibility and generate speech | |
| def update_tts_button(history): | |
| if history and history[-1].get("role") == "assistant": | |
| return gr.update(visible=True) | |
| return gr.update(visible=False) | |
| chatbot.change( | |
| fn=update_tts_button, | |
| inputs=[chatbot], | |
| outputs=[tts_button] | |
| ) | |
| tts_button.click( | |
| fn=generate_speech_from_chat, | |
| inputs=[chatbot], | |
| outputs=[tts_audio] | |
| ) | |
| with gr.Accordion("⚙️ Advanced Settings", open=False): | |
| with gr.Row(): | |
| use_rag = gr.Checkbox( | |
| value=False, | |
| label="Enable Document RAG", | |
| info="Answer based on uploaded documents (requires document upload)" | |
| ) | |
| use_web_search = gr.Checkbox( | |
| value=False, | |
| label="Enable Web Search (MCP)", | |
| info="Fetch knowledge from online medical resources" | |
| ) | |
| medical_model = gr.Radio( | |
| choices=list(MEDSWIN_MODELS.keys()), | |
| value=DEFAULT_MEDICAL_MODEL, | |
| label="Medical Model", | |
| info="MedSwin TA (default), others download on first use" | |
| ) | |
| system_prompt = gr.Textbox( | |
| value="As a medical specialist, provide detailed and accurate answers based on the provided medical documents and context. Ensure all information is clinically accurate and cite sources when available.", | |
| label="System Prompt", | |
| lines=3 | |
| ) | |
| with gr.Tab("Generation Parameters"): | |
| temperature = gr.Slider( | |
| minimum=0, | |
| maximum=1, | |
| step=0.1, | |
| value=0.2, | |
| label="Temperature" | |
| ) | |
| max_new_tokens = gr.Slider( | |
| minimum=512, | |
| maximum=4096, | |
| step=128, | |
| value=2048, | |
| label="Max New Tokens", | |
| info="Increased for medical models to prevent early stopping" | |
| ) | |
| top_p = gr.Slider( | |
| minimum=0.0, | |
| maximum=1.0, | |
| step=0.1, | |
| value=0.7, | |
| label="Top P" | |
| ) | |
| top_k = gr.Slider( | |
| minimum=1, | |
| maximum=100, | |
| step=1, | |
| value=50, | |
| label="Top K" | |
| ) | |
| penalty = gr.Slider( | |
| minimum=0.0, | |
| maximum=2.0, | |
| step=0.1, | |
| value=1.2, | |
| label="Repetition Penalty" | |
| ) | |
| with gr.Tab("Retrieval Parameters"): | |
| retriever_k = gr.Slider( | |
| minimum=5, | |
| maximum=30, | |
| step=1, | |
| value=15, | |
| label="Initial Retrieval Size (Top K)" | |
| ) | |
| merge_threshold = gr.Slider( | |
| minimum=0.1, | |
| maximum=0.9, | |
| step=0.1, | |
| value=0.5, | |
| label="Merge Threshold (lower = more merging)" | |
| ) | |
| submit_button.click( | |
| fn=stream_chat, | |
| inputs=[ | |
| message_input, | |
| chatbot, | |
| system_prompt, | |
| temperature, | |
| max_new_tokens, | |
| top_p, | |
| top_k, | |
| penalty, | |
| retriever_k, | |
| merge_threshold, | |
| use_rag, | |
| medical_model, | |
| use_web_search | |
| ], | |
| outputs=chatbot | |
| ) | |
| message_input.submit( | |
| fn=stream_chat, | |
| inputs=[ | |
| message_input, | |
| chatbot, | |
| system_prompt, | |
| temperature, | |
| max_new_tokens, | |
| top_p, | |
| top_k, | |
| penalty, | |
| retriever_k, | |
| merge_threshold, | |
| use_rag, | |
| medical_model, | |
| use_web_search | |
| ], | |
| outputs=chatbot | |
| ) | |
| return demo | |
| if __name__ == "__main__": | |
| # Preload models on startup | |
| logger.info("Preloading models on startup...") | |
| logger.info("Initializing default medical model (MedSwin TA)...") | |
| initialize_medical_model(DEFAULT_MEDICAL_MODEL) | |
| logger.info("Preloading TTS model...") | |
| try: | |
| initialize_tts_model() | |
| if global_tts_model is not None: | |
| logger.info("TTS model preloaded successfully!") | |
| else: | |
| logger.warning("TTS model not available - will use MCP or disable voice generation") | |
| except Exception as e: | |
| logger.warning(f"TTS model preloading failed: {e}") | |
| logger.warning("Text-to-speech will use MCP or be disabled") | |
| # Check Gemini MCP availability | |
| if MCP_AVAILABLE: | |
| logger.info("Gemini MCP is available for translation, summarization, document parsing, and transcription") | |
| else: | |
| logger.warning("Gemini MCP not available - translation, summarization, document parsing, and transcription features will be limited") | |
| logger.info("Model preloading complete!") | |
| demo = create_demo() | |
| demo.launch() | |