# app.py for your Gradio Space
import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# --- 1. Configuration ---
# This is the ID of your LoRA adapter repository on the Hub
hub_adapter_id = "Prashasst/Sushruta-P3.8Q"  # Make sure this is correct!

# Define the quantization configuration for efficient 4-bit inference
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Your system prompt from training
SYSTEM_PROMPT = """You are Sushruta-P3.8Q, a specialized medical AI assistant from Prashasst's AI Labs. You were created and fine-tuned by Prashasst Dongre to serve as a reliable and accessible educational tool for the public. Your primary purpose is to help any user understand complex medical topics by providing clear, logical, step-by-step analyses in a ...... format."""

# --- 2. Load the Model and Tokenizer ---
print("Loading base model and tokenizer...")

# This single call downloads the base Phi-3 model, quantizes it to 4-bit,
# and applies your LoRA adapters from the Hub automatically (this requires
# the peft package to be installed).
model = AutoModelForCausalLM.from_pretrained(
    hub_adapter_id,
    quantization_config=quantization_config,
    device_map="auto",
    trust_remote_code=True,
    low_cpu_mem_usage=True,
)
tokenizer = AutoTokenizer.from_pretrained(hub_adapter_id)
print("Model loaded successfully!")

# --- 3. Define the Prediction Function ---
def generate_response(user_question):
    """
    Takes a user's question, formats the prompt, runs it through the model,
    and returns only the newly generated answer.
    """
    # Format the prompt using the official Phi-3 chat template
    prompt = f"<|system|>\n{SYSTEM_PROMPT}<|end|>\n<|user|>\n{user_question}<|end|>\n<|assistant|>\n"

    # Tokenize the input (keep the attention mask so generate() behaves correctly)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Generate the response
    outputs = model.generate(**inputs, max_new_tokens=1024)

    # Decode only the newly generated tokens. Note: splitting the full decoded
    # string on "<|assistant|>" would fail, because skip_special_tokens=True
    # strips that marker from the decoded text.
    input_length = inputs["input_ids"].shape[-1]
    response_text = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)
    return response_text.strip()

# --- 4. Create the Gradio Interface ---
with gr.Blocks(theme='soft') as demo:
    gr.Markdown("# Sushruta-P3.8Q: Your Medical AI Assistant")
    gr.Markdown("Created by Prashasst Dongre.")
    with gr.Row():
        question_box = gr.Textbox(label="Enter your medical question here")
        submit_button = gr.Button("Ask Sushruta")
    answer_box = gr.Markdown(label="Sushruta's Analysis:")
    submit_button.click(generate_response, inputs=question_box, outputs=answer_box)

# --- 5. Make the Interface Publicly Accessible ---
# On a Hugging Face Space, launch() already serves the app as a public web
# page and an API; share=True is unnecessary there (Gradio ignores it on Spaces).
demo.queue().launch()
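
# --- Appendix A: suggested requirements.txt (a sketch, not from the original) ---
# The code above needs these packages on the Space: peft so that
# AutoModelForCausalLM.from_pretrained() can resolve a LoRA adapter repo,
# bitsandbytes for the 4-bit BitsAndBytesConfig, and accelerate for
# device_map="auto". Pin exact versions as appropriate for your Space.
#
#   torch
#   gradio
#   transformers
#   peft
#   bitsandbytes
#   accelerate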
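
# --- Appendix B: calling the Space as an API (a hedged sketch) ---
# Gradio exposes the click handler as an API endpoint. The endpoint name below
# ("/generate_response") assumes Gradio's default of naming it after the Python
# function; confirm the exact name on the Space's "Use via API" page. The Space
# ID is also an assumption (it may differ from the adapter repo ID). Run this
# from your own machine, not inside app.py:
#
#   from gradio_client import Client
#
#   client = Client("Prashasst/Sushruta-P3.8Q")  # assumed Space ID; replace with yours
#   answer = client.predict(
#       "What are the common symptoms of anemia?",
#       api_name="/generate_response",
#   )
#   print(answer)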