Spaces: Running on Zero
Y Phung Nguyen committed
Commit · 4a5418d
Parent(s): acc39fd

Fix model preloader

Files changed:
- pipeline.py +4 -14
- ui.py +91 -58
pipeline.py CHANGED

@@ -370,25 +370,15 @@ def stream_chat(
         return
 
     # Check if model is loaded before proceeding
+    # NOTE: We don't load the model here to save time - it should be pre-loaded before stream_chat is called
     if not is_model_loaded(medical_model):
         loading_state = get_model_loading_state(medical_model)
         if loading_state == "loading":
             error_msg = f"⏳ {medical_model} is still loading. Please wait until the model status shows 'loaded and ready' before sending messages."
         else:
-            error_msg = f"⚠️ {medical_model} is not
-
-
-            set_model_loading_state(medical_model, "loading")
-            initialize_medical_model(medical_model)
-            # If successful, continue
-        except Exception as e:
-            error_msg = f"⚠️ Error loading {medical_model}: {str(e)[:200]}. Please try again."
-            yield history + [{"role": "assistant", "content": error_msg}], ""
-            return
-
-    if not is_model_loaded(medical_model):
-        yield history + [{"role": "assistant", "content": error_msg}], ""
-        return
+            error_msg = f"⚠️ {medical_model} is not loaded. Please wait for the model to finish loading or select a model from the dropdown."
+        yield history + [{"role": "assistant", "content": error_msg}], ""
+        return
 
     thought_handler = None
     if show_thoughts:
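For orientation, a minimal self-contained sketch of the fail-fast pattern this hunk switches to: the chat generator only inspects the load state and reports it to the user, while the actual loading is left to the pre-loader wired up in ui.py below. The `_MODEL_STATES` registry and the model name are illustrative stand-ins, not the Space's real implementation.

from typing import Iterator

_MODEL_STATES: dict[str, str] = {}   # model name -> "loading" | "loaded" (illustrative stand-in)

def is_model_loaded(name: str) -> bool:
    return _MODEL_STATES.get(name) == "loaded"

def get_model_loading_state(name: str) -> str:
    return _MODEL_STATES.get(name, "not_loaded")

def stream_chat(message: str, history: list, medical_model: str) -> Iterator:
    # Fail fast: never trigger a model load inside the handler, just report the state.
    if not is_model_loaded(medical_model):
        if get_model_loading_state(medical_model) == "loading":
            error_msg = f"⏳ {medical_model} is still loading."
        else:
            error_msg = f"⚠️ {medical_model} is not loaded."
        yield history + [{"role": "assistant", "content": error_msg}], ""
        return
    # ...normal streaming generation would continue from here...
    yield history + [{"role": "assistant", "content": f"(reply from {medical_model})"}], ""

# Nothing has been pre-loaded, so the generator yields the warning and stops.
for updated_history, _ in stream_chat("hi", [], "medical-model"):
    print(updated_history[-1]["content"])
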
ui.py CHANGED

@@ -649,71 +649,104 @@ def create_demo():
             outputs=[model_status, submit_button, message_input]
         )
 
-        #
+        # Background model loading when user focuses on input (pre-loads before sending message)
+        @spaces.GPU(max_duration=MAX_DURATION)
+        def preload_model_on_input_focus():
+            """Pre-load model when user focuses on input to avoid loading during stream_chat"""
+            try:
+                if not is_model_loaded(DEFAULT_MEDICAL_MODEL):
+                    logger.info("[PRELOAD] User focused on input - pre-loading model in background...")
+                    loading_state = get_model_loading_state(DEFAULT_MEDICAL_MODEL)
+                    if loading_state != "loading":  # Don't start if already loading
+                        try:
+                            set_model_loading_state(DEFAULT_MEDICAL_MODEL, "loading")
+                            initialize_medical_model(DEFAULT_MEDICAL_MODEL)
+                            if is_model_loaded(DEFAULT_MEDICAL_MODEL):
+                                logger.info("[PRELOAD] ✅ Model pre-loaded successfully!")
+                                return "✅ Model pre-loaded and ready"
+                            else:
+                                logger.warning("[PRELOAD] Model initialization completed but not marked as loaded")
+                                return "⚠️ Model loading in progress..."
+                        except Exception as e:
+                            error_msg = str(e)
+                            is_quota_error = ("429" in error_msg or "Too Many Requests" in error_msg or
+                                              "quota" in error_msg.lower() or "ZeroGPU" in error_msg or
+                                              "runnning out" in error_msg.lower() or "running out" in error_msg.lower())
+                            if is_quota_error:
+                                logger.warning(f"[PRELOAD] Quota error during pre-load: {error_msg[:100]}")
+                                return "⚠️ Quota limit - model will load when you send message"
+                            else:
+                                logger.error(f"[PRELOAD] Error pre-loading model: {e}")
+                                return "⚠️ Pre-load failed - will try on message send"
+                    else:
+                        return "⏳ Model is already loading..."
+                else:
+                    return "✅ Model already loaded"
+            except Exception as e:
+                logger.error(f"[PRELOAD] Error in preload function: {e}")
+                return "⚠️ Pre-load error"
+
+        def trigger_preload_on_focus():
+            """Trigger model pre-loading when user focuses on input"""
+            try:
+                if not is_model_loaded(DEFAULT_MEDICAL_MODEL):
+                    # Start pre-loading in background (non-blocking)
+                    logger.info("[PRELOAD] Input focused - triggering background model load...")
+                    # This will run in GPU context but won't block the UI
+                    preload_model_on_input_focus()
+            except Exception as e:
+                logger.debug(f"[PRELOAD] Pre-load trigger error (non-critical): {e}")
+            # Return empty string to not update any UI element
+            return ""
+
+        # Trigger model pre-loading when user focuses on message input
+        message_input.focus(
+            fn=trigger_preload_on_focus,
+            inputs=None,
+            outputs=None
+        )
+
+        # Wrap stream_chat - ensure model is loaded before starting (don't load inside stream_chat to save time)
         def stream_chat_with_model_check(
             message, history, system_prompt, temperature, max_new_tokens,
             top_p, top_k, penalty, retriever_k, merge_threshold,
             use_rag, medical_model_name, use_web_search,
             enable_clinical_intake, disable_agentic_reasoning, show_thoughts, request: gr.Request = None
         ):
-
-
-
-
-
-            try:
-                # Check if model is currently loading (don't block if it's already loaded)
-                loading_state = get_model_loading_state(medical_model_name)
-                if loading_state == "loading" and not is_model_loaded(medical_model_name):
-                    error_msg = f"⏳ {medical_model_name} is still loading. Please wait until the model status shows 'loaded and ready' before sending messages."
-                    updated_history = history + [{"role": "assistant", "content": error_msg}]
-                    yield updated_history, ""
-                    return
-
-                # If request is None, create a mock request for compatibility
-                if request is None:
-                    class MockRequest:
-                        session_hash = "anonymous"
-                    request = MockRequest()
-
-                # Let stream_chat handle model loading (it's GPU-decorated and can load on-demand)
-                for result in stream_chat(
-                    message, history, system_prompt, temperature, max_new_tokens,
-                    top_p, top_k, penalty, retriever_k, merge_threshold,
-                    use_rag, medical_model_name, use_web_search,
-                    enable_clinical_intake, disable_agentic_reasoning, show_thoughts, request
-                ):
-                    yield result
-                # If we get here, stream_chat completed successfully
-                return
-
-            except Exception as e:
-                error_msg_lower = str(e).lower()
-                is_gpu_error = 'gpu task aborted' in error_msg_lower or 'gpu' in error_msg_lower or 'zerogpu' in error_msg_lower
-
-                if is_gpu_error and attempt < max_retries - 1:
-                    delay = base_delay * (2 ** attempt)  # Exponential backoff: 2s, 4s
-                    logger.warning(f"[STREAM_CHAT] GPU task aborted (attempt {attempt + 1}/{max_retries}), retrying after {delay}s...")
-                    # Yield a message to user about retry
-                    retry_msg = f"⏳ GPU task was interrupted. Retrying in {delay}s... (attempt {attempt + 1}/{max_retries})"
-                    updated_history = history + [{"role": "assistant", "content": retry_msg}]
-                    yield updated_history, ""
-                    time.sleep(delay)
-                    continue
+            # Check if model is loaded - if not, show error (don't load here to save stream_chat time)
+            if not is_model_loaded(medical_model_name):
+                loading_state = get_model_loading_state(medical_model_name)
+                if loading_state == "loading":
+                    error_msg = f"⏳ {medical_model_name} is still loading. Please wait until the model status shows 'loaded and ready' before sending messages."
                 else:
-
-
-
-
-
-
-
-
-
-
-
-
-
+                    error_msg = f"⚠️ {medical_model_name} is not loaded. Please wait a moment for the model to finish loading, or select a model from the dropdown to load it."
+                updated_history = history + [{"role": "assistant", "content": error_msg}]
+                yield updated_history, ""
+                return
+
+            # If request is None, create a mock request for compatibility
+            if request is None:
+                class MockRequest:
+                    session_hash = "anonymous"
+                request = MockRequest()
+
+            # Model is loaded, proceed with stream_chat (no model loading here to save time)
+            try:
+                for result in stream_chat(
+                    message, history, system_prompt, temperature, max_new_tokens,
+                    top_p, top_k, penalty, retriever_k, merge_threshold,
+                    use_rag, medical_model_name, use_web_search,
+                    enable_clinical_intake, disable_agentic_reasoning, show_thoughts, request
+                ):
+                    yield result
+            except Exception as e:
+                # Handle any errors gracefully
+                logger.error(f"Error in stream_chat_with_model_check: {e}")
+                import traceback
+                logger.debug(f"Full traceback: {traceback.format_exc()}")
+                error_msg = f"⚠️ An error occurred: {str(e)[:200]}"
+                updated_history = history + [{"role": "assistant", "content": error_msg}]
+                yield updated_history, ""
 
         submit_button.click(
             fn=stream_chat_with_model_check,
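To illustrate the new wiring end to end, here is a rough standalone sketch with the model layer stubbed out. `load_model`, `MODEL_NAME`, and the 2-second sleep are invented stand-ins for the Space's real helpers, and it assumes a Gradio version whose Textbox exposes the `.focus()` event that the diff relies on. The point of the pattern: the expensive load is kicked off as soon as the user clicks into the textbox, so the submit handler can assume a warm model and simply refuse if it is not.

import time
import gradio as gr

MODEL_NAME = "demo-model"   # stand-in for DEFAULT_MEDICAL_MODEL
_loaded = {"state": False}  # stand-in for the real load-state registry

def load_model() -> None:
    time.sleep(2)           # stand-in for the real initialize_medical_model()
    _loaded["state"] = True

def trigger_preload_on_focus() -> None:
    # Runs when the textbox gains focus; harmless if the model is already warm.
    if not _loaded["state"]:
        load_model()

def respond(message: str, history: list):
    # Same fail-fast check as the wrapped handler: report, don't load.
    if not _loaded["state"]:
        return history + [{"role": "assistant", "content": f"⚠️ {MODEL_NAME} is not loaded yet."}], ""
    return history + [{"role": "assistant", "content": f"{MODEL_NAME} says: {message[::-1]}"}], ""

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(type="messages")
    box = gr.Textbox(placeholder="Focusing here pre-loads the model")
    # Same pattern as the diff: the focus event fires the preload and updates no outputs.
    box.focus(fn=trigger_preload_on_focus, inputs=None, outputs=None)
    box.submit(fn=respond, inputs=[box, chatbot], outputs=[chatbot, box])

if __name__ == "__main__":
    demo.launch()

Compared with the previous on-demand loading inside the GPU-decorated handler, this keeps the per-message GPU window short and turns a cold model into an immediate status message rather than a long stall on the first send.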