Y Phung Nguyen committed on
Commit f7415cc · 1 Parent(s): 09d7494

Fix model preloader

Files changed (3)
  1. models.py +115 -20
  2. pipeline.py +28 -1
  3. ui.py +77 -11
models.py CHANGED
@@ -57,26 +57,56 @@ def is_model_loaded(model_name: str) -> bool:
             config.global_medical_models[model_name] is not None and
             _model_loading_states.get(model_name) == "loaded")
 
-def initialize_medical_model(model_name: str):
-    """Initialize medical model (MedSwin) - download on demand"""
+def initialize_medical_model(model_name: str, load_to_gpu: bool = True):
+    """
+    Initialize medical model (MedSwin) - download on demand
+
+    According to ZeroGPU best practices:
+    - If load_to_gpu=True: Load directly to GPU using device_map="auto" (must be called within @spaces.GPU decorated function)
+    - If load_to_gpu=False: Load to CPU first, then move to GPU in inference function
+
+    Args:
+        model_name: Name of the model to load
+        load_to_gpu: If True, load directly to GPU. If False, load to CPU (for ZeroGPU best practices)
+    """
     if model_name not in config.global_medical_models or config.global_medical_models[model_name] is None:
         set_model_loading_state(model_name, "loading")
-        logger.info(f"Initializing medical model: {model_name}...")
+        logger.info(f"Initializing medical model: {model_name}... (load_to_gpu={load_to_gpu})")
         try:
-            # Clear GPU cache before loading to prevent memory issues
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-                logger.debug("Cleared GPU cache before model loading")
-
             model_path = config.MEDSWIN_MODELS[model_name]
             tokenizer = AutoTokenizer.from_pretrained(model_path, token=config.HF_TOKEN)
-            model = AutoModelForCausalLM.from_pretrained(
-                model_path,
-                device_map="auto",
-                trust_remote_code=True,
-                token=config.HF_TOKEN,
-                torch_dtype=torch.float16
-            )
+
+            if load_to_gpu:
+                # Load directly to GPU (must be within @spaces.GPU decorated function)
+                # Clear GPU cache before loading to prevent memory issues
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+                    logger.debug("Cleared GPU cache before model loading")
+
+                model = AutoModelForCausalLM.from_pretrained(
+                    model_path,
+                    device_map="auto",  # Automatically places model on GPU
+                    trust_remote_code=True,
+                    token=config.HF_TOKEN,
+                    torch_dtype=torch.float16
+                )
+
+                # Clear cache after loading
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+                    logger.debug("Cleared GPU cache after model loading")
+            else:
+                # Load to CPU first (ZeroGPU best practice - no GPU decorator needed)
+                logger.info(f"Loading {model_name} to CPU (will move to GPU during inference)...")
+                model = AutoModelForCausalLM.from_pretrained(
+                    model_path,
+                    device_map="cpu",  # Load to CPU
+                    trust_remote_code=True,
+                    token=config.HF_TOKEN,
+                    torch_dtype=torch.float16
+                )
+                logger.info(f"Model {model_name} loaded to CPU successfully")
+
             # Set models in config BEFORE setting state to "loaded"
             config.global_medical_models[model_name] = model
             config.global_medical_tokenizers[model_name] = tokenizer
@@ -87,11 +117,6 @@ def initialize_medical_model(model_name: str):
             # Verify the state was set correctly
             if not is_model_loaded(model_name):
                 logger.warning(f"Model {model_name} initialized but is_model_loaded() returns False. State: {get_model_loading_state(model_name)}, in dict: {model_name in config.global_medical_models}")
-
-            # Clear cache after loading to free up temporary memory
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-                logger.debug("Cleared GPU cache after model loading")
         except Exception as e:
             set_model_loading_state(model_name, "error")
             logger.error(f"Failed to initialize medical model {model_name}: {e}")
@@ -106,6 +131,76 @@ def initialize_medical_model(model_name: str):
         set_model_loading_state(model_name, "loaded")
     return config.global_medical_models[model_name], config.global_medical_tokenizers[model_name]
 
+def move_model_to_gpu(model_name: str):
+    """
+    Move a model from CPU to GPU (for ZeroGPU best practices)
+    Must be called within a @spaces.GPU decorated function
+
+    According to ZeroGPU best practices:
+    - Models should be loaded to CPU first (no GPU quota used)
+    - Models are moved to GPU only during inference (within @spaces.GPU decorated function)
+    """
+    if model_name not in config.global_medical_models:
+        raise ValueError(f"Model {model_name} not found in config")
+
+    model = config.global_medical_models[model_name]
+    if model is None:
+        raise ValueError(f"Model {model_name} is None")
+
+    # Check if model is already on GPU
+    try:
+        # For models with device_map, check the actual device
+        if hasattr(model, 'device'):
+            device_str = str(model.device)
+            if 'cuda' in device_str.lower():
+                logger.debug(f"Model {model_name} is already on GPU ({device_str})")
+                return model
+
+        # Check device_map if available
+        if hasattr(model, 'hf_device_map'):
+            device_map = model.hf_device_map
+            if isinstance(device_map, dict):
+                # Check if any device is GPU
+                if any('cuda' in str(dev).lower() for dev in device_map.values()):
+                    logger.debug(f"Model {model_name} is already on GPU (device_map)")
+                    return model
+    except Exception as e:
+        logger.debug(f"Could not check model device: {e}")
+
+    # Move model to GPU
+    logger.info(f"Moving model {model_name} from CPU to GPU...")
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+
+    # For models loaded with device_map="cpu", we need to reload with device_map="auto"
+    # or use accelerate to dispatch to GPU
+    try:
+        # Try using accelerate's dispatch_model for proper GPU placement
+        from accelerate import dispatch_model
+        from accelerate.utils import get_balanced_memory, infer_auto_device_map
+
+        # Get device map for GPU
+        max_memory = get_balanced_memory(model, max_memory={0: "20GiB"})
+        device_map = infer_auto_device_map(model, max_memory=max_memory)
+        model = dispatch_model(model, device_map=device_map)
+        config.global_medical_models[model_name] = model
+        logger.info(f"Model {model_name} moved to GPU successfully using accelerate")
+    except Exception as e:
+        # Fallback: simple move to cuda (may not work for all model architectures)
+        logger.warning(f"Could not use accelerate dispatch, trying simple .to('cuda'): {e}")
+        try:
+            model = model.to('cuda')
+            config.global_medical_models[model_name] = model
+            logger.info(f"Model {model_name} moved to GPU (cuda) successfully")
+        except Exception as e2:
+            logger.error(f"Failed to move model {model_name} to GPU: {e2}")
+            raise
+
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+
+    return model
+
 def initialize_tts_model():
     """Initialize TTS model for text-to-speech"""
     if not TTS_AVAILABLE:
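Usage note (not part of the commit): a minimal sketch of the calling pattern the new load_to_gpu flag and move_model_to_gpu() are designed for on ZeroGPU Spaces. The model key "MedSwin-7B" and the generate() wrapper are hypothetical; config and models come from this repo, and spaces is the Hugging Face ZeroGPU helper package.

import spaces
import config
from models import initialize_medical_model, move_model_to_gpu

MODEL_NAME = "MedSwin-7B"  # hypothetical key in config.MEDSWIN_MODELS

# Startup (no @spaces.GPU decorator): weights stay on CPU, so no GPU quota is used.
initialize_medical_model(MODEL_NAME, load_to_gpu=False)

@spaces.GPU
def generate(prompt: str) -> str:
    # Inside the GPU-decorated call the CPU copy is dispatched to CUDA;
    # later calls detect the model is already on GPU and return it as-is.
    model = move_model_to_gpu(MODEL_NAME)
    tokenizer = config.global_medical_tokenizers[MODEL_NAME]
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output_ids = model.generate(**inputs, max_new_tokens=128)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)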
pipeline.py CHANGED
@@ -12,7 +12,7 @@ from llama_index.core import StorageContext, VectorStoreIndex, load_index_from_s
 from llama_index.core import Settings
 from llama_index.core.retrievers import AutoMergingRetriever
 from logger import logger, ThoughtCaptureHandler
-from models import initialize_medical_model, get_or_create_embed_model, is_model_loaded, get_model_loading_state, set_model_loading_state
+from models import initialize_medical_model, get_or_create_embed_model, is_model_loaded, get_model_loading_state, set_model_loading_state, move_model_to_gpu
 from utils import detect_language, translate_text, format_url_as_domain
 from search import search_web, summarize_web_content
 from reasoning import autonomous_reasoning, create_execution_plan, autonomous_execution_strategy
@@ -380,6 +380,33 @@ def stream_chat(
         yield history + [{"role": "assistant", "content": error_msg}], ""
         return
 
+    # ZeroGPU best practice: If model is on CPU, move it to GPU now (we're in a GPU-decorated function)
+    # This ensures the model is ready for inference without consuming GPU quota during startup
+    try:
+        import config
+        if medical_model in config.global_medical_models:
+            model = config.global_medical_models[medical_model]
+            if model is not None:
+                # Check if model is on CPU (device_map="cpu" or device is CPU)
+                model_on_cpu = False
+                if hasattr(model, 'device'):
+                    if str(model.device) == 'cpu':
+                        model_on_cpu = True
+                elif hasattr(model, 'hf_device_map'):
+                    # Model loaded with device_map - check if it's on CPU
+                    if isinstance(model.hf_device_map, dict):
+                        # If all devices are CPU, move to GPU
+                        if all('cpu' in str(dev).lower() for dev in model.hf_device_map.values()):
+                            model_on_cpu = True
+
+                if model_on_cpu:
+                    logger.info(f"[STREAM_CHAT] Model {medical_model} is on CPU, moving to GPU for inference...")
+                    move_model_to_gpu(medical_model)
+                    logger.info(f"[STREAM_CHAT] ✅ Model {medical_model} moved to GPU successfully")
+    except Exception as e:
+        logger.warning(f"[STREAM_CHAT] Could not move model to GPU (may already be on GPU): {e}")
+        # Continue anyway - model might already be on GPU
+
     thought_handler = None
     if show_thoughts:
         thought_handler = ThoughtCaptureHandler()
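For reference, the CPU-detection check added to stream_chat above can be read as a small standalone helper. The sketch below is an illustration, not code from the repo; the model_is_on_cpu name and the parameter-based fallback are assumptions.

import torch

def model_is_on_cpu(model: torch.nn.Module) -> bool:
    """Condensed form of the check above: True when the model still lives on the CPU."""
    if hasattr(model, "device"):
        # Single-device models: transformers exposes a .device attribute.
        return str(model.device) == "cpu"
    if isinstance(getattr(model, "hf_device_map", None), dict):
        # Models dispatched by accelerate: every shard must be placed on CPU.
        return all("cpu" in str(dev).lower() for dev in model.hf_device_map.values())
    # Fallback (assumption): inspect the first parameter's device directly.
    first_param = next(model.parameters(), None)
    return first_param is None or first_param.device.type == "cpu"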
ui.py CHANGED
@@ -406,10 +406,68 @@ def create_demo():
         return status_text, is_ready
 
     # GPU-decorated function to load ONLY medical model on startup
-    # TTS and Whisper load on-demand to avoid GPU conflicts and reduce startup time
-    @spaces.GPU(max_duration=MAX_DURATION)
-    def load_medical_model_on_startup():
-        """Load only the default medical model on startup to avoid GPU conflicts"""
+    # According to ZeroGPU best practices:
+    # 1. Load models to CPU in global scope (no GPU decorator needed)
+    # 2. Move models to GPU only in inference functions (with @spaces.GPU decorator)
+    # However, for large models, loading to CPU then moving to GPU uses more memory
+    # So we use a hybrid approach: load to GPU directly but within GPU-decorated function
+
+    def load_medical_model_on_startup_cpu():
+        """
+        Load model to CPU on startup (ZeroGPU best practice - no GPU decorator needed)
+        Model will be moved to GPU during first inference
+        """
+        status_messages = []
+
+        try:
+            # Load only medical model (MedSwin) to CPU - TTS and Whisper load on-demand
+            if not is_model_loaded(DEFAULT_MEDICAL_MODEL):
+                logger.info(f"[STARTUP] Loading medical model to CPU: {DEFAULT_MEDICAL_MODEL}...")
+                set_model_loading_state(DEFAULT_MEDICAL_MODEL, "loading")
+                try:
+                    # Load to CPU (no GPU decorator needed)
+                    initialize_medical_model(DEFAULT_MEDICAL_MODEL, load_to_gpu=False)
+                    # Verify model is actually loaded
+                    if is_model_loaded(DEFAULT_MEDICAL_MODEL):
+                        status_messages.append(f"✅ MedSwin ({DEFAULT_MEDICAL_MODEL}): loaded to CPU")
+                        logger.info(f"[STARTUP] ✅ Medical model {DEFAULT_MEDICAL_MODEL} loaded to CPU successfully!")
+                    else:
+                        status_messages.append(f"⚠️ MedSwin ({DEFAULT_MEDICAL_MODEL}): loading failed")
+                        logger.warning(f"[STARTUP] Medical model {DEFAULT_MEDICAL_MODEL} initialization completed but not marked as loaded")
+                        set_model_loading_state(DEFAULT_MEDICAL_MODEL, "error")
+                except Exception as e:
+                    status_messages.append(f"❌ MedSwin ({DEFAULT_MEDICAL_MODEL}): error - {str(e)[:50]}")
+                    logger.error(f"[STARTUP] Failed to load medical model: {e}")
+                    import traceback
+                    logger.debug(f"[STARTUP] Full traceback: {traceback.format_exc()}")
+                    set_model_loading_state(DEFAULT_MEDICAL_MODEL, "error")
+            else:
+                status_messages.append(f"✅ MedSwin ({DEFAULT_MEDICAL_MODEL}): already loaded")
+                logger.info(f"[STARTUP] Medical model {DEFAULT_MEDICAL_MODEL} already loaded")
+
+            # Add ASR status (will load on first use)
+            if WHISPER_AVAILABLE:
+                status_messages.append("⏳ ASR (Whisper): will load on first use")
+            else:
+                status_messages.append("❌ ASR: library not available")
+
+            # Return status
+            status_text = "\n".join(status_messages)
+            logger.info(f"[STARTUP] ✅ Model loading complete. Status:\n{status_text}")
+            return status_text
+
+        except Exception as e:
+            error_msg = str(e)
+            logger.error(f"[STARTUP] Error loading model to CPU: {error_msg}")
+            return f"⚠️ Error loading model: {error_msg[:100]}"
+
+    # Alternative: Load directly to GPU (requires GPU decorator)
+    # @spaces.GPU(max_duration=MAX_DURATION)
+    def load_medical_model_on_startup_gpu():
+        """
+        Load model directly to GPU on startup (alternative approach)
+        Uses GPU quota but model is immediately ready for inference
+        """
         import torch
         status_messages = []
 
@@ -421,14 +479,15 @@ def create_demo():
 
         # Load only medical model (MedSwin) - TTS and Whisper load on-demand
        if not is_model_loaded(DEFAULT_MEDICAL_MODEL):
-            logger.info(f"[STARTUP] Loading medical model: {DEFAULT_MEDICAL_MODEL}...")
+            logger.info(f"[STARTUP] Loading medical model to GPU: {DEFAULT_MEDICAL_MODEL}...")
             set_model_loading_state(DEFAULT_MEDICAL_MODEL, "loading")
             try:
-                initialize_medical_model(DEFAULT_MEDICAL_MODEL)
+                # Load directly to GPU (within GPU-decorated function)
+                initialize_medical_model(DEFAULT_MEDICAL_MODEL, load_to_gpu=True)
                 # Verify model is actually loaded
                 if is_model_loaded(DEFAULT_MEDICAL_MODEL):
-                    status_messages.append(f"✅ MedSwin ({DEFAULT_MEDICAL_MODEL}): loaded")
-                    logger.info(f"[STARTUP] ✅ Medical model {DEFAULT_MEDICAL_MODEL} loaded successfully!")
+                    status_messages.append(f"✅ MedSwin ({DEFAULT_MEDICAL_MODEL}): loaded to GPU")
+                    logger.info(f"[STARTUP] ✅ Medical model {DEFAULT_MEDICAL_MODEL} loaded to GPU successfully!")
                 else:
                     status_messages.append(f"⚠️ MedSwin ({DEFAULT_MEDICAL_MODEL}): loading failed")
                     logger.warning(f"[STARTUP] Medical model {DEFAULT_MEDICAL_MODEL} initialization completed but not marked as loaded")
@@ -573,15 +632,22 @@ def create_demo():
     # Load medical model on startup and update status
     # Use a wrapper to handle GPU context properly with retry logic
     def load_startup_and_update_ui():
-        """Load model on startup with retry logic (max 3 attempts) and return status with UI updates"""
+        """
+        Load model on startup with retry logic (max 3 attempts) and return status with UI updates
+
+        Uses CPU-first approach (ZeroGPU best practice):
+        - Load model to CPU (no GPU decorator needed, avoids quota issues)
+        - Model will be moved to GPU during first inference
+        """
         import time
         max_retries = 3
         base_delay = 5.0  # Start with 5 seconds delay
 
         for attempt in range(1, max_retries + 1):
             try:
-                logger.info(f"[STARTUP] Attempt {attempt}/{max_retries} to load medical model...")
-                status_text = load_medical_model_on_startup()
+                logger.info(f"[STARTUP] Attempt {attempt}/{max_retries} to load medical model to CPU...")
+                # Use CPU-first approach (no GPU decorator, avoids quota issues)
+                status_text = load_medical_model_on_startup_cpu()
                 # Check if model is ready and update submit button state
                 is_ready = is_model_loaded(DEFAULT_MEDICAL_MODEL)
                 if is_ready:
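The retry wrapper is only partially visible in this hunk. The self-contained sketch below (an illustration, not the repo's exact code) shows the shape of the pattern: call the CPU-first loader up to max_retries times and wait between failed attempts. The load_with_retries name and the linear backoff are assumptions; max_retries=3 and base_delay=5.0 match the values in the diff.

import time

def load_with_retries(loader, max_retries: int = 3, base_delay: float = 5.0) -> str:
    """Call a startup loader (e.g. load_medical_model_on_startup_cpu) with retries."""
    last_error = "unknown error"
    for attempt in range(1, max_retries + 1):
        try:
            return loader()  # on success the returned status string goes to the UI
        except Exception as exc:
            last_error = str(exc)
            if attempt < max_retries:
                time.sleep(base_delay * attempt)  # back off before the next attempt
    return f"⚠️ Startup failed after {max_retries} attempts: {last_error[:100]}"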