LiamKhoaLe committed on
Commit 2a31cee · 0 Parent(s):

Enhance frontend with search icon

Files changed (17)
  1. .gitattributes +35 -0
  2. .gitignore +2 -0
  3. .huggingface.yml +4 -0
  4. Dockerfile +36 -0
  5. README.md +13 -0
  6. app.py +312 -0
  7. chat-history.md +382 -0
  8. clear_mongo.py +48 -0
  9. connect_mongo.py +24 -0
  10. diagnosis.py +76 -0
  11. download_model.py +51 -0
  12. memory.py +426 -0
  13. migrate.py +48 -0
  14. requirements.txt +23 -0
  15. translation.py +26 -0
  16. vlm.py +54 -0
  17. warmup.py +8 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ .env
2
+ secrets.toml
.huggingface.yml ADDED
@@ -0,0 +1,4 @@
1
+ sdk: docker
2
+ app_file: app.py
3
+ port: 7860
4
+ hardware: cpu-basic
Dockerfile ADDED
@@ -0,0 +1,36 @@
1
+ FROM python:3.11
2
+
3
+ # Create and use a non-root user (optional)
4
+ RUN useradd -m -u 1000 user
5
+ USER user
6
+ ENV PATH="/home/user/.local/bin:$PATH"
7
+
8
+ # Set working directory
9
+ WORKDIR /app
10
+
11
+ # Copy all project files to the container
12
+ COPY . .
13
+
14
+ # Install dependencies
15
+ RUN pip install --no-cache-dir -r requirements.txt
16
+
17
+ # Set Hugging Face cache directory to persist model downloads
18
+ ENV HF_HOME="/home/user/.cache/huggingface"
19
+ ENV SENTENCE_TRANSFORMERS_HOME="/home/user/.cache/huggingface/sentence-transformers"
20
+ ENV MEDGEMMA_HOME="/home/user/.cache/huggingface/sentence-transformers"
21
+
22
+ # Create cache directories and ensure permissions
23
+ RUN mkdir -p /app/model_cache /home/user/.cache/huggingface/sentence-transformers && \
24
+ chown -R user:user /app/model_cache /home/user/.cache/huggingface
25
+
26
+ # Pre-load model in a separate script
27
+ RUN python /app/download_model.py && python /app/warmup.py
28
+
29
+ # Ensure ownership and permissions remain intact
30
+ RUN chown -R user:user /app/model_cache
31
+
32
+ # Expose port
33
+ EXPOSE 7860
34
+
35
+ # Run the application
36
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
README.md ADDED
@@ -0,0 +1,13 @@
1
+ ---
2
+ title: Medical Chatbot
3
+ emoji: 🤖🩺
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: docker
7
+ sdk_version: latest
8
+ pinned: false
9
+ license: apache-2.0
10
+ short_description: MedicalChatbot, FAISS, Gemini, MongoDB vDB, LRU
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,312 @@
1
+ # app.py
2
+ import os
3
+ import faiss
4
+ import numpy as np
5
+ import time
6
+ import uvicorn
7
+ from fastapi import FastAPI, Request
8
+ from fastapi.responses import JSONResponse
9
+ from pymongo import MongoClient
10
+ from google import genai
11
+ from sentence_transformers import SentenceTransformer
12
+ from sentence_transformers.util import cos_sim
13
+ from memory import MemoryManager
14
+ from translation import translate_query
15
+ from vlm import process_medical_image
16
+
17
+ # ✅ Enable Logging for Debugging
18
+ import logging
19
+ # —————— Silence Noisy Loggers ——————
20
+ for name in [
21
+ "uvicorn.error", "uvicorn.access",
22
+ "fastapi", "starlette",
23
+ "pymongo", "gridfs",
24
+ "sentence_transformers", "faiss",
25
+ "google", "google.auth",
26
+ ]:
27
+ logging.getLogger(name).setLevel(logging.WARNING)
28
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s — %(name)s — %(levelname)s — %(message)s", force=True) # Change INFO to DEBUG for full-ctx JSON loader
29
+ logger = logging.getLogger("medical-chatbot")
30
+ logger.setLevel(logging.DEBUG)
31
+
32
+ # Debug Start
33
+ logger.info("🚀 Starting Medical Chatbot API...")
34
+
35
+ # ✅ Environment Variables
36
+ mongo_uri = os.getenv("MONGO_URI")
37
+ index_uri = os.getenv("INDEX_URI")
38
+ gemini_flash_api_key = os.getenv("FlashAPI")
39
+ # Validate environment variables
40
+ if not all([gemini_flash_api_key, mongo_uri, index_uri]):
41
+ raise ValueError("❌ Missing API keys! Set them in Hugging Face Secrets.")
42
+ # logger.info(f"🔎 MongoDB URI: {mongo_uri}")
43
+ # logger.info(f"🔎 FAISS Index URI: {index_uri}")
44
+
45
+ # ✅ Monitor Resources Before Startup
46
+ import psutil
47
+ def check_system_resources():
48
+ memory = psutil.virtual_memory()
49
+ cpu = psutil.cpu_percent(interval=1)
50
+ disk = psutil.disk_usage("/")
51
+ # Log current system resource usage
52
+ logger.info(f"[System] 🔍 System Resources - RAM: {memory.percent}%, CPU: {cpu}%, Disk: {disk.percent}%")
53
+ if memory.percent > 85:
54
+ logger.warning("⚠️ High RAM usage detected!")
55
+ if cpu > 90:
56
+ logger.warning("⚠️ High CPU usage detected!")
57
+ if disk.percent > 90:
58
+ logger.warning("⚠️ High Disk usage detected!")
59
+ check_system_resources()
60
+
61
+ # ✅ Reduce Memory usage with optimizers
62
+ os.environ["OMP_NUM_THREADS"] = "1"
63
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
64
+
65
+ # ✅ Initialize FastAPI app
66
+ app = FastAPI(title="Medical Chatbot API")
67
+ memory = MemoryManager()
68
+
69
+ from fastapi.middleware.cors import CORSMiddleware # Bypassing CORS origin
70
+ # Define the origins
71
+ origins = [
72
+ "http://localhost:5173", # Vite dev server
73
+ "http://localhost:3000", # Another vercel local dev
74
+ "https://medical-chatbot-henna.vercel.app", # ✅ Vercel frontend production URL
75
+
76
+ ]
77
+ # Add the CORS middleware:
78
+ app.add_middleware(
79
+ CORSMiddleware,
80
+ allow_origins=origins, # or ["*"] to allow all
81
+ allow_credentials=True,
82
+ allow_methods=["*"],
83
+ allow_headers=["*"],
84
+ )
85
+
86
+ # ✅ Use Lazy Loading for FAISS Index
87
+ index = None # Delay FAISS Index loading until first query
88
+
89
+ # ✅ Load SentenceTransformer Model (Quantized/Halved)
90
+ logger.info("[Embedder] 📥 Loading SentenceTransformer Model...")
91
+ MODEL_CACHE_DIR = "/app/model_cache"
92
+ try:
93
+ embedding_model = SentenceTransformer(MODEL_CACHE_DIR, device="cpu")
94
+ embedding_model = embedding_model.half() # Reduce memory
95
+ logger.info("✅ Model Loaded Successfully.")
96
+ except Exception as e:
97
+ logger.error(f"❌ Model Loading Failed: {e}")
98
+ exit(1)
99
+
100
+ # Cache in-memory vectors (optional — useful for <10k rows)
101
+ SYMPTOM_VECTORS = None
102
+ SYMPTOM_DOCS = None
103
+
104
+ # ✅ Setup MongoDB Connection
105
+ # QA data
106
+ client = MongoClient(mongo_uri)
107
+ db = client["MedicalChatbotDB"]
108
+ qa_collection = db["qa_data"]
109
+ # FAISS Index data
110
+ iclient = MongoClient(index_uri)
111
+ idb = iclient["MedicalChatbotDB"]
112
+ index_collection = idb["faiss_index_files"]
113
+ # Symptom Diagnosis data
114
+ symptom_client = MongoClient(mongo_uri)
115
+ symptom_col = symptom_client["MedicalChatbotDB"]["symptom_diagnosis"]
116
+
117
+ # ✅ Load FAISS Index (Lazy Load)
118
+ import gridfs
119
+ fs = gridfs.GridFS(idb, collection="faiss_index_files")
120
+
121
+ def load_faiss_index():
122
+ global index
123
+ if index is None:
124
+ logger.info("[KB] ⏳ Loading FAISS index from GridFS...")
125
+ existing_file = fs.find_one({"filename": "faiss_index.bin"})
126
+ if existing_file:
127
+ stored_index_bytes = existing_file.read()
128
+ index_bytes_np = np.frombuffer(stored_index_bytes, dtype='uint8')
129
+ index = faiss.deserialize_index(index_bytes_np)
130
+ logger.info("[KB] ✅ FAISS Index Loaded")
131
+ else:
132
+ logger.error("[KB] ❌ FAISS index not found in GridFS.")
133
+ return index
134
+
135
+ # ✅ Retrieve Medical Info (256,916 scenarios)
136
+ def retrieve_medical_info(query, k=5, min_sim=0.9): # Minimum similarity threshold between query and KB results (0.9)
137
+ global index
138
+ index = load_faiss_index()
139
+ if index is None:
140
+ return [""]
141
+ # Embed query
142
+ query_vec = embedding_model.encode([query], convert_to_numpy=True)
143
+ D, I = index.search(query_vec, k=k)
144
+ # Filter by cosine threshold
145
+ results = []
146
+ kept = []
147
+ kept_vecs = []
148
+ # Smart dedup on cosine threshold between similar candidates
149
+ for score, idx in zip(D[0], I[0]):
150
+ if score < min_sim:
151
+ continue
152
+ # List sim docs
153
+ doc = qa_collection.find_one({"i": int(idx)})
154
+ if not doc:
155
+ continue
156
+ # Only compare answers
157
+ answer = doc.get("Doctor", "").strip()
158
+ if not answer:
159
+ continue
160
+ # Check semantic redundancy among previously kept results
161
+ new_vec = embedding_model.encode([answer], convert_to_numpy=True)[0]
162
+ is_similar = False
163
+ for i, vec in enumerate(kept_vecs):
164
+ sim = np.dot(vec, new_vec) / (np.linalg.norm(vec) * np.linalg.norm(new_vec) + 1e-9)
165
+ if sim >= 0.9: # High semantic similarity
166
+ is_similar = True
167
+ # Keep only better match to original query
168
+ cur_sim_to_query = np.dot(vec, query_vec[0]) / (np.linalg.norm(vec) * np.linalg.norm(query_vec[0]) + 1e-9)
169
+ new_sim_to_query = np.dot(new_vec, query_vec[0]) / (np.linalg.norm(new_vec) * np.linalg.norm(query_vec[0]) + 1e-9)
170
+ if new_sim_to_query > cur_sim_to_query:
171
+ kept[i] = answer
172
+ kept_vecs[i] = new_vec
173
+ break
174
+ # Non-similar candidates
175
+ if not is_similar:
176
+ kept.append(answer)
177
+ kept_vecs.append(new_vec)
178
+ # Final
179
+ return kept if kept else [""]
180
+
181
+
182
+ # ✅ Retrieve Sym-Dia Info (4,962 scenarios)
183
+ def retrieve_diagnosis_from_symptoms(symptom_text, top_k=5, min_sim=0.5):
184
+ global SYMPTOM_VECTORS, SYMPTOM_DOCS
185
+ # Lazy load
186
+ if SYMPTOM_VECTORS is None:
187
+ all_docs = list(symptom_col.find({}, {"embedding": 1, "answer": 1, "question": 1, "prognosis": 1}))
188
+ SYMPTOM_DOCS = all_docs
189
+ SYMPTOM_VECTORS = np.array([doc["embedding"] for doc in all_docs], dtype=np.float32)
190
+ # Embed input
191
+ qvec = embedding_model.encode(symptom_text, convert_to_numpy=True)
192
+ qvec = qvec / (np.linalg.norm(qvec) + 1e-9)
193
+ # Similarity compute
194
+ sims = SYMPTOM_VECTORS @ qvec # cosine
195
+ sorted_idx = np.argsort(sims)[-top_k:][::-1]
196
+ seen_diag = set()
197
+ final = [] # Dedup
198
+ for i in sorted_idx:
199
+ sim = sims[i]
200
+ if sim < min_sim:
201
+ continue
202
+ label = SYMPTOM_DOCS[i]["prognosis"]
203
+ if label not in seen_diag:
204
+ final.append(SYMPTOM_DOCS[i]["answer"])
205
+ seen_diag.add(label)
206
+ return final
207
+
208
+ # ✅ Gemini Flash API Call
209
+ def gemini_flash_completion(prompt, model, temperature=0.7):
210
+ client_genai = genai.Client(api_key=gemini_flash_api_key)
211
+ try:
212
+ response = client_genai.models.generate_content(model=model, contents=prompt)
213
+ return response.text
214
+ except Exception as e:
215
+ logger.error(f"[LLM] ❌ Error calling Gemini API: {e}")
216
+ return "Error generating response from Gemini."
217
+
218
+ # ✅ Chatbot Class
219
+ class RAGMedicalChatbot:
220
+ def __init__(self, model_name, retrieve_function):
221
+ self.model_name = model_name
222
+ self.retrieve = retrieve_function
223
+
224
+ def chat(self, user_id: str, user_query: str, lang: str = "EN", image_diagnosis: str = "") -> str:
225
+ # 0. Translate the query to English if not EN; this helps our RAG system
226
+ if lang.upper() in {"VI", "ZH"}:
227
+ user_query = translate_query(user_query, lang.lower())
228
+
229
+ # 1. Fetch knowledge
230
+ ## a. KB for generic QA retrieval
231
+ retrieved_info = self.retrieve(user_query)
232
+ knowledge_base = "\n".join(retrieved_info)
233
+ ## b. Diagnosis RAG from symptom query
234
+ diagnosis_guides = retrieve_diagnosis_from_symptoms(user_query) # smart matcher
235
+
236
+ # 2. Hybrid Context Retrieval: RAG + Recent History + Intelligent Selection
237
+ contextual_chunks = memory.get_contextual_chunks(user_id, user_query, lang)
238
+
239
+ # 3. Build prompt parts
240
+ parts = ["You are a medical chatbot, designed to answer medical questions."]
241
+ parts.append("Please format your answer using MarkDown.")
242
+ parts.append("**Bold for titles**, *italic for emphasis*, and clear headings.")
243
+
244
+ # 4. Append image diagnosis from VLM
245
+ if image_diagnosis:
246
+ parts.append(
247
+ "A user medical image is diagnosed by our VLM agent:\n"
248
+ f"{image_diagnosis}\n\n"
249
+ "Please incorporate the above findings in your response if medically relevant.\n\n"
250
+ )
251
+
252
+ # Append contextual chunks from hybrid approach
253
+ if contextual_chunks:
254
+ parts.append("Relevant context from conversation history:\n" + contextual_chunks)
255
+ # Load up guideline (RAG over medical knowledge base)
256
+ if knowledge_base:
257
+ parts.append(f"Example Q&A medical scenario knowledge-base: {knowledge_base}")
258
+ # Symptom-Diagnosis prediction RAG
259
+ if diagnosis_guides:
260
+ parts.append("Symptom-based diagnosis guidance (if applicable):\n" + "\n".join(diagnosis_guides))
261
+ parts.append(f"User's question: {user_query}")
262
+ parts.append(f"Language to generate answer: {lang}")
263
+ prompt = "\n\n".join(parts)
264
+ logger.info(f"[LLM] Question query in `prompt`: {prompt}") # Debug out checking RAG on kb and history
265
+ response = gemini_flash_completion(prompt, model=self.model_name, temperature=0.7)
266
+ # Store exchange + chunking
267
+ if user_id:
268
+ memory.add_exchange(user_id, user_query, response, lang=lang)
269
+ logger.info(f"[LLM] Response on `prompt`: {response.strip()}") # Debug out base response
270
+ return response.strip()
271
+
272
+ # ✅ Initialize Chatbot
273
+ chatbot = RAGMedicalChatbot(model_name="gemini-2.5-flash", retrieve_function=retrieve_medical_info)
274
+
275
+ # ✅ Chat Endpoint
276
+ @app.post("/chat")
277
+ async def chat_endpoint(req: Request):
278
+ body = await req.json()
279
+ user_id = body.get("user_id", "anonymous")
280
+ query_raw = body.get("query")
281
+ query = query_raw.strip() if isinstance(query_raw, str) else ""
282
+ lang = body.get("lang", "EN")
283
+ image_base64 = body.get("image_base64", None)
284
+ img_desc = body.get("img_desc", "Describe and investigate any clinical findings from this medical image.")
285
+ start = time.time()
286
+ image_diagnosis = ""
287
+ # LLM Only
288
+ if not image_base64:
289
+ logger.info("[BOT] LLM scenario.")
290
+ # LLM+VLM
291
+ else:
292
+ # If image is present → diagnose first
293
+ safe_load = len(image_base64.encode("utf-8"))
294
+ if safe_load > 5_000_000: # Reject oversized images
295
+ return JSONResponse({"response": "⚠️ Image too large. Please upload smaller images (<5MB)."})
296
+ logger.info("[BOT] VLM+LLM scenario.")
297
+ logger.info(f"[VLM] Process medical image size: {safe_load}, desc: {img_desc}, {lang}.")
298
+ image_diagnosis = process_medical_image(image_base64, img_desc, lang)
299
+ answer = chatbot.chat(user_id, query, lang, image_diagnosis)
300
+ elapsed = time.time() - start
301
+ # Final
302
+ return JSONResponse({"response": f"{answer}\n\n(Response time: {elapsed:.2f}s)"})
303
+
304
+
305
+ # ✅ Run Uvicorn
306
+ if __name__ == "__main__":
307
+ logger.info("[System] ✅ Starting FastAPI Server...")
308
+ try:
309
+ uvicorn.run(app, host="0.0.0.0", port=7860, log_level="debug")
310
+ except Exception as e:
311
+ logger.error(f"❌ Server Startup Failed: {e}")
312
+ exit(1)
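For reference, the `/chat` endpoint above expects a JSON body with `user_id`, `query`, `lang`, and optionally `image_base64`/`img_desc`, and returns `{"response": ...}`. A minimal client sketch (the base URL is a placeholder for your deployment):

```python
# Minimal client sketch for the /chat endpoint defined in app.py.
# BASE_URL is a placeholder; point it at the deployed Space or a local run.
import requests

BASE_URL = "http://localhost:7860"

payload = {
    "user_id": "demo-user",
    "query": "I have had a sore throat and mild fever for two days.",
    "lang": "EN",
    # "image_base64": "<base64-encoded image>",  # optional: triggers the VLM path
    # "img_desc": "Describe any clinical findings in this image.",
}
resp = requests.post(f"{BASE_URL}/chat", json=payload, timeout=120)
print(resp.json()["response"])
```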
chat-history.md ADDED
@@ -0,0 +1,382 @@
1
+ # 🔄 Enhanced Memory System: STM + LTM + Hybrid Context Retrieval
2
+
3
+ ## Overview
4
+
5
+ The Medical Chatbot now implements an **advanced memory system** with **Short-Term Memory (STM)** and **Long-Term Memory (LTM)** that intelligently manages conversation context, semantic knowledge, and conversational continuity. This system goes beyond simple RAG to provide truly intelligent, contextually aware responses that remember and build upon previous interactions.
6
+
7
+ ## 🏗️ Architecture
8
+
9
+ ### Memory Hierarchy
10
+ ```
11
+ User Query → Enhanced Memory System → Intelligent Context Selection → LLM Response
12
+
13
+ ┌─────────────────┬─────────────────┬─────────────────┐
14
+ │ STM (5 items) │ LTM (60 items)│ RAG Search │
15
+ │ (Recent Summaries)│ (Semantic Store)│ (Knowledge Base)│
16
+ └─────────────────┴─────────────────┴─────────────────┘
17
+
18
+ Gemini Flash Lite Contextual Analysis
19
+
20
+ Summarized Context + Semantic Knowledge
21
+ ```
22
+
23
+ ### Memory Types
24
+
25
+ #### 1. **Short-Term Memory (STM)**
26
+ - **Capacity:** 5 recent conversation summaries
27
+ - **Content:** Chunked and summarized LLM responses with enriched topics
28
+ - **Features:** Semantic deduplication, intelligent merging, topic enrichment
29
+ - **Purpose:** Maintain conversational continuity and immediate context
30
+
31
+ #### 2. **Long-Term Memory (LTM)**
32
+ - **Capacity:** 60 semantic chunks (~20 conversational rounds)
33
+ - **Content:** FAISS-indexed medical knowledge chunks
34
+ - **Features:** Semantic similarity search, usage tracking, smart eviction
35
+ - **Purpose:** Provide deep medical knowledge and historical context
36
+
37
+ #### 3. **RAG Knowledge Base**
38
+ - **Content:** External medical knowledge and guidelines
39
+ - **Features:** Real-time retrieval, semantic matching
40
+ - **Purpose:** Supplement with current medical information
41
+
42
+ ## 🔧 Key Components
43
+
44
+ ### 1. Enhanced Memory Manager (`memory.py`)
45
+
46
+ #### STM Management
47
+ ```python
48
+ def get_recent_chat_history(self, user_id: str, num_turns: int = 5) -> List[Dict]:
49
+ """
50
+ Get the most recent STM summaries (not raw Q/A).
51
+ Returns: [{"user": "", "bot": "Topic: ...\n<summary>", "timestamp": time}, ...]
52
+ """
53
+ ```
54
+
55
+ **STM Features:**
56
+ - **Capacity:** 5 recent conversation summaries
57
+ - **Content:** Chunked and summarized LLM responses with enriched topics
58
+ - **Deduplication:** Semantic similarity-based merging (≥0.92 identical, ≥0.75 merge)
59
+ - **Topic Enrichment:** Uses user question context to generate detailed topics
60
+
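For illustration, a simplified sketch of how the two STM thresholds listed above can drive the replace/merge/append decision (cosine similarity on L2-normalised vectors; the real logic lives in `_upsert_stm` in `memory.py`):

```python
# Simplified sketch of the STM dedup/merge decision using the thresholds above.
# `embed` is assumed to return an L2-normalised vector, so dot product == cosine similarity.
import numpy as np

IDENTICAL_THRESHOLD = 0.92  # replace the older, near-duplicate summary
MERGE_THRESHOLD = 0.75      # merge partially overlapping summaries

def upsert_summary(stm: list, new_text: str, embed) -> None:
    vec = embed(new_text)
    sims = [float(np.dot(vec, entry["vec"])) for entry in stm]
    best = int(np.argmax(sims)) if sims else -1
    if best >= 0 and sims[best] >= IDENTICAL_THRESHOLD:
        stm[best] = {"text": new_text, "vec": vec}            # nearly identical: replace
    elif best >= 0 and sims[best] >= MERGE_THRESHOLD:
        merged = stm[best]["text"] + "\n" + new_text          # partially similar: merge details
        stm[best] = {"text": merged, "vec": embed(merged)}
    else:
        stm.append({"text": new_text, "vec": vec})            # genuinely new summary
```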
61
+ #### LTM Management
62
+ ```python
63
+ def get_relevant_chunks(self, user_id: str, query: str, top_k: int = 3, min_sim: float = 0.30) -> List[str]:
64
+ """Return texts of chunks whose cosine similarity ≥ min_sim."""
65
+ ```
66
+
67
+ **LTM Features:**
68
+ - **Capacity:** 60 semantic chunks (~20 conversational rounds)
69
+ - **Indexing:** FAISS-based semantic search
70
+ - **Smart Eviction:** Usage-based decay and recency scoring
71
+ - **Merging:** Intelligent deduplication and content fusion
72
+
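A minimal sketch of the LTM store described above: an inner-product FAISS index over L2-normalised 384-dimensional vectors (the all-MiniLM-L6-v2 embedding size), so inner product equals cosine similarity, with results filtered by a similarity floor:

```python
# Minimal sketch of the LTM store: cosine search via an inner-product FAISS index
# over L2-normalised 384-d vectors (all-MiniLM-L6-v2), filtered by min_sim.
from typing import List
import faiss
import numpy as np

DIM = 384
index = faiss.IndexFlatIP(DIM)
chunk_texts: List[str] = []

def add_chunk(text: str, vec: np.ndarray) -> None:
    vec = vec / (np.linalg.norm(vec) + 1e-9)            # normalise so IP == cosine
    index.add(np.array([vec], dtype=np.float32))
    chunk_texts.append(text)

def search(query_vec: np.ndarray, top_k: int = 3, min_sim: float = 0.30) -> List[str]:
    qvec = query_vec / (np.linalg.norm(query_vec) + 1e-9)
    sims, idxs = index.search(np.array([qvec], dtype=np.float32), top_k)
    return [chunk_texts[i] for sim, i in zip(sims[0], idxs[0]) if i != -1 and sim >= min_sim]
```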
73
+ #### Enhanced Chunking
74
+ ```python
75
+ def chunk_response(self, response: str, lang: str, question: str = "") -> List[Dict]:
76
+ """
77
+ Enhanced chunking with question context for richer topics.
78
+ Returns: [{"tag": "detailed_topic", "text": "summary"}, ...]
79
+ """
80
+ ```
81
+
82
+ **Chunking Features:**
83
+ - **Question Context:** Incorporates user's latest question for topic generation
84
+ - **Rich Topics:** Detailed topics (10-20 words) capturing context, condition, and action
85
+ - **Medical Focus:** Excludes disclaimers, includes exact medication names/doses
86
+ - **Semantic Grouping:** Groups by medical topic, symptom, assessment, plan, or instruction
87
+
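A small sketch of parsing the chunker output format used above (parts separated by `---`, each opening with a `Topic: ...` line), mirroring `chunk_response`/`_quick_extract_topic` in `memory.py`:

```python
# Sketch of parsing the Gemini chunking output described above:
# parts separated by `---`, each starting with a `Topic: ...` line.
import re
from typing import Dict, List

def parse_chunks(output: str) -> List[Dict[str, str]]:
    chunks = []
    for part in output.split("---"):
        part = part.strip()
        if not part:
            continue
        match = re.search(r"^Topic:\s*(.+)", part, re.IGNORECASE | re.MULTILINE)
        tag = match.group(1).strip() if match else " ".join(part.split()[:3])
        chunks.append({"tag": tag, "text": part})
    return chunks
```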
88
+ ### 2. Intelligent Context Retrieval
89
+
90
+ #### Contextual Summarization
91
+ ```python
92
+ def get_contextual_chunks(self, user_id: str, current_query: str, lang: str = "EN") -> str:
93
+ """
94
+ Creates a single, coherent summary from STM + LTM + RAG.
95
+ Returns: A single summary string for the main LLM.
96
+ """
97
+ ```
98
+
99
+ **Features:**
100
+ - **Unified Summary:** Combines STM (5 turns) + LTM (semantic) + RAG (knowledge)
101
+ - **Gemini Analysis:** Uses Gemini Flash Lite for intelligent context selection
102
+ - **Conversational Flow:** Maintains continuity while providing medical relevance
103
+ - **Fallback Strategy:** Graceful degradation if analysis fails
104
+
105
+ ## 🚀 How It Works
106
+
107
+ ### Step 1: Enhanced Memory Processing
108
+ ```python
109
+ # Process new exchange through STM and LTM
110
+ chunks = memory.chunk_response(response, lang, question=query)
111
+ for chunk in chunks:
112
+ memory._upsert_stm(user_id, chunk, lang) # STM with dedupe/merge
113
+ memory._upsert_ltm(user_id, chunks, lang) # LTM with semantic storage
114
+ ```
115
+
116
+ ### Step 2: Context Retrieval
117
+ ```python
118
+ # Get STM summaries (5 recent turns)
119
+ recent_history = memory.get_recent_chat_history(user_id, num_turns=5)
120
+
121
+ # Get LTM semantic chunks
122
+ rag_chunks = memory.get_relevant_chunks(user_id, current_query, top_k=3)
123
+
124
+ # Get external RAG knowledge
125
+ external_rag = retrieve_medical_info(current_query)
126
+ ```
127
+
128
+ ### Step 3: Intelligent Context Summarization
129
+ The system sends all context sources to Gemini Flash Lite for unified summarization:
130
+
131
+ ```
132
+ You are a medical assistant creating a concise summary of conversation context for continuity.
133
+
134
+ Current user query: "{current_query}"
135
+
136
+ Available context information:
137
+ Recent conversation history:
138
+ {recent_history}
139
+
140
+ Semantically relevant historical medical information:
141
+ {rag_chunks}
142
+
143
+ Task: Create a brief, coherent summary that captures the key points from the conversation history and relevant medical information that are important for understanding the current query.
144
+
145
+ Guidelines:
146
+ 1. Focus on medical symptoms, diagnoses, treatments, or recommendations mentioned
147
+ 2. Include any patient concerns or questions that are still relevant
148
+ 3. Highlight any follow-up needs or pending clarifications
149
+ 4. Keep the summary concise but comprehensive enough for context
150
+ 5. Maintain conversational flow and continuity
151
+
152
+ Output: Provide a single, well-structured summary paragraph that can be used as context for the main LLM to provide a coherent response.
153
+ ```
154
+
155
+ ### Step 4: Unified Context Integration
156
+ The single, coherent summary is integrated into the main LLM prompt, providing:
157
+ - **Conversational continuity** (from STM summaries)
158
+ - **Medical knowledge** (from LTM semantic chunks)
159
+ - **Current information** (from external RAG)
160
+ - **Unified narrative** (single summary instead of multiple chunks)
161
+
162
+ ## 📊 Benefits
163
+
164
+ ### 1. **Advanced Memory Management**
165
+ - **STM:** Maintains 5 recent conversation summaries with intelligent deduplication
166
+ - **LTM:** Stores 60 semantic chunks (~20 rounds) with FAISS indexing
167
+ - **Smart Merging:** Combines similar content while preserving unique details
168
+ - **Topic Enrichment:** Detailed topics using user question context
169
+
170
+ ### 2. **Intelligent Context Summarization**
171
+ - **Unified Summary:** Single coherent narrative instead of multiple chunks
172
+ - **Gemini Analysis:** AI-powered context selection and summarization
173
+ - **Medical Focus:** Prioritizes symptoms, diagnoses, treatments, and recommendations
174
+ - **Conversational Flow:** Maintains natural dialogue continuity
175
+
176
+ ### 3. **Enhanced Chunking & Topics**
177
+ - **Question Context:** Incorporates user's latest question for richer topics
178
+ - **Detailed Topics:** 10-20 word descriptions capturing context, condition, and action
179
+ - **Medical Precision:** Includes exact medication names, doses, and clinical instructions
180
+ - **Semantic Grouping:** Organizes by medical topic, symptom, assessment, plan, or instruction
181
+
182
+ ### 4. **Robust Fallback Strategy**
183
+ - **Primary:** Gemini Flash Lite contextual summarization
184
+ - **Secondary:** LTM semantic search with usage-based scoring
185
+ - **Tertiary:** STM recent summaries
186
+ - **Final:** External RAG knowledge base
187
+
188
+ ### 5. **Performance & Scalability**
189
+ - **Efficient Storage:** Semantic deduplication reduces memory footprint
190
+ - **Fast Retrieval:** FAISS indexing for sub-millisecond LTM search
191
+ - **Smart Eviction:** Usage-based decay and recency scoring
192
+ - **Minimal Latency:** Optimized for real-time medical consultations
193
+
194
+ ## 🧪 Example Scenarios
195
+
196
+ ### Scenario 1: STM Deduplication & Merging
197
+ ```
198
+ User: "I have chest pain"
199
+ Bot: "This could be angina. Symptoms include pressure, tightness, and shortness of breath."
200
+
201
+ User: "What about chest pain with shortness of breath?"
202
+ Bot: "Chest pain with shortness of breath is concerning for angina or heart attack..."
203
+
204
+ User: "Tell me more about the symptoms"
205
+ Bot: "Angina symptoms include chest pressure, tightness, shortness of breath, and may radiate to arms..."
206
+ ```
207
+ **Result:** STM merges similar responses, creating a comprehensive summary: "Patient has chest pain symptoms consistent with angina, including pressure, tightness, shortness of breath, and potential radiation to arms. This represents a concerning cardiac presentation requiring immediate evaluation."
208
+
209
+ ### Scenario 2: LTM Semantic Retrieval
210
+ ```
211
+ User: "What medications should I avoid with my condition?"
212
+ Bot: "Based on your previous discussion about hypertension and the medications mentioned..."
213
+ ```
214
+ **Result:** LTM retrieves relevant medical information about hypertension medications and contraindications from previous conversations, even if not in recent STM.
215
+
216
+ ### Scenario 3: Enhanced Topic Generation
217
+ ```
218
+ User: "I'm having trouble sleeping"
219
+ Bot: "Topic: Sleep disturbance evaluation and management for adult patient with insomnia symptoms"
220
+ ```
221
+ **Result:** The topic incorporates the user's question context to create a detailed, medical-specific description instead of just "Sleep problems."
222
+
223
+ ### Scenario 4: Unified Context Summarization
224
+ ```
225
+ User: "Can you repeat the treatment plan?"
226
+ Bot: "Based on our conversation about your hypertension and sleep issues, your treatment plan includes..."
227
+ ```
228
+ **Result:** The system creates a unified summary combining STM (recent sleep discussion), LTM (hypertension history), and RAG (current treatment guidelines) into a single coherent narrative.
229
+
230
+ ## ⚙️ Configuration
231
+
232
+ ### Environment Variables
233
+ ```bash
234
+ FlashAPI=your_gemini_api_key # For both main LLM and contextual analysis
235
+ ```
236
+
237
+ ### Enhanced Memory Settings
238
+ ```python
239
+ memory = MemoryManager(
240
+ max_users=1000, # Maximum users in memory
241
+ history_per_user=5, # STM capacity (5 recent summaries)
242
+ max_chunks=60 # LTM capacity (~20 conversational rounds)
243
+ )
244
+ ```
245
+
246
+ ### Memory Parameters
247
+ ```python
248
+ # STM retrieval (5 recent turns)
249
+ recent_history = memory.get_recent_chat_history(user_id, num_turns=5)
250
+
251
+ # LTM semantic search
252
+ rag_chunks = memory.get_relevant_chunks(user_id, query, top_k=3, min_sim=0.30)
253
+
254
+ # Unified context summarization
255
+ contextual_summary = memory.get_contextual_chunks(user_id, current_query, lang)
256
+ ```
257
+
258
+ ### Similarity Thresholds
259
+ ```python
260
+ # STM deduplication thresholds
261
+ IDENTICAL_THRESHOLD = 0.92 # Replace older with newer
262
+ MERGE_THRESHOLD = 0.75 # Merge similar content
263
+
264
+ # LTM semantic search
265
+ MIN_SIMILARITY = 0.30 # Minimum similarity for retrieval
266
+ TOP_K = 3 # Number of chunks to retrieve
267
+ ```
268
+
269
+ ## 🔍 Monitoring & Debugging
270
+
271
+ ### Enhanced Logging
272
+ The system provides comprehensive logging for all memory operations:
273
+ ```python
274
+ # STM operations
275
+ logger.info(f"[Contextual] Retrieved {len(recent_history)} recent history items")
276
+ logger.info(f"[Contextual] Retrieved {len(rag_chunks)} RAG chunks")
277
+
278
+ # Chunking operations
279
+ logger.info(f"[Memory] 📦 Gemini summarized chunk output: {output}")
280
+ logger.warning(f"[Memory] ❌ Gemini chunking failed: {e}")
281
+
282
+ # Contextual summarization
283
+ logger.info(f"[Contextual] Gemini created summary: {summary[:100]}...")
284
+ logger.warning(f"[Contextual] Gemini summarization failed: {e}")
285
+ ```
286
+
287
+ ### Performance Metrics
288
+ - **STM Operations:** Deduplication rate, merge frequency, topic enrichment quality
289
+ - **LTM Operations:** FAISS search latency, semantic similarity scores, eviction patterns
290
+ - **Context Summarization:** Gemini response time, summary quality, fallback usage
291
+ - **Memory Usage:** Storage efficiency, retrieval hit rates, cache performance
292
+
293
+ ## 🚨 Error Handling
294
+
295
+ ### Enhanced Fallback Strategy
296
+ 1. **Primary:** Gemini Flash Lite contextual summarization
297
+ 2. **Secondary:** LTM semantic search with usage-based scoring
298
+ 3. **Tertiary:** STM recent summaries
299
+ 4. **Final:** External RAG knowledge base
300
+ 5. **Emergency:** No context (minimal response)
301
+
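For illustration, a compact sketch of the cascade above; the four retrieval functions are passed in as callables, and the names are placeholders rather than the real API:

```python
# Illustrative sketch of the fallback cascade above. The callables are placeholders
# for the real retrieval functions (Gemini summary, LTM search, STM summaries, external RAG).
from typing import Callable, List, Optional

def build_context(
    summarize: Callable[[], Optional[str]],   # 1. primary: Gemini Flash Lite summary
    ltm_search: Callable[[], List[str]],      # 2. secondary: LTM semantic search
    stm_recent: Callable[[], List[str]],      # 3. tertiary: STM recent summaries
    external_rag: Callable[[], List[str]],    # 4. final: external knowledge base
) -> str:
    try:
        summary = summarize()
        if summary:
            return summary
    except Exception:
        pass
    for source in (ltm_search, stm_recent, external_rag):
        results = source()
        if results:
            return "\n".join(results)
    return ""                                 # 5. emergency: no context
```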
302
+ ### Error Scenarios & Recovery
303
+ - **Gemini API failure** → Fall back to LTM semantic search
304
+ - **LTM corruption** → Rebuild FAISS index from remaining chunks
305
+ - **STM corruption** → Reset to empty STM, continue with LTM
306
+ - **Memory corruption** → Reset user session, clear all memory
307
+ - **Chunking failure** → Store raw response as fallback chunk
308
+
309
+ ## 🔮 Future Enhancements
310
+
311
+ ### 1. **Persistent Memory Storage**
312
+ - **Database Integration:** Store LTM in PostgreSQL/SQLite with FAISS index persistence
313
+ - **Session Recovery:** Resume conversations after system restarts
314
+ - **Memory Export:** Allow users to export their conversation history
315
+ - **Cross-device Sync:** Synchronize memory across different devices
316
+
317
+ ### 2. **Advanced Memory Features**
318
+ - **Fact Store:** Dedicated storage for critical medical facts (allergies, chronic conditions, medications)
319
+ - **Memory Compression:** Summarize older STM entries into LTM when STM overflows
320
+ - **Contextual Tags:** Add metadata tags (encounter type, modality, urgency) to bias retrieval
321
+ - **Memory Analytics:** Track memory usage patterns and optimize storage strategies
322
+
323
+ ### 3. **Intelligent Memory Management**
324
+ - **Adaptive Thresholds:** Dynamically adjust similarity thresholds based on conversation context
325
+ - **Memory Prioritization:** Protect critical medical information from eviction
326
+ - **Usage-based Retention:** Keep frequently accessed information longer
327
+ - **Semantic Clustering:** Group related memories for better organization
328
+
329
+ ### 4. **Enhanced Medical Context**
330
+ - **Clinical Decision Support:** Integrate with medical guidelines and protocols
331
+ - **Risk Assessment:** Track and alert on potential medical risks across conversations
332
+ - **Medication Reconciliation:** Maintain accurate medication lists across sessions
333
+ - **Follow-up Scheduling:** Track recommended follow-ups and reminders
334
+
335
+ ### 5. **Multi-modal Memory**
336
+ - **Image Memory:** Store and retrieve medical images with descriptions
337
+ - **Voice Memory:** Convert voice interactions to text for memory storage
338
+ - **Document Memory:** Process and store medical documents and reports
339
+ - **Temporal Memory:** Track changes in symptoms and conditions over time
340
+
341
+ ## 📝 Testing
342
+
343
+ ### Memory System Testing
344
+ ```bash
345
+ cd Medical-Chatbot
346
+ python test_memory_system.py
347
+ ```
348
+
349
+ ### Test Scenarios
350
+ 1. **STM Deduplication Test:** Verify similar responses are merged correctly
351
+ 2. **LTM Semantic Search Test:** Test FAISS retrieval with various queries
352
+ 3. **Context Summarization Test:** Validate unified summary generation
353
+ 4. **Topic Enrichment Test:** Check detailed topic generation with question context
354
+ 5. **Memory Capacity Test:** Verify STM (5 items) and LTM (60 items) limits
355
+ 6. **Fallback Strategy Test:** Test system behavior when Gemini API fails
356
+
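`test_memory_system.py` is referenced here but not included in this commit. A hedged sketch of two deterministic checks against `MemoryManager` (assumes the container environment from the Dockerfile, i.e. the model cache at `/app/model_cache` and the `FlashAPI` secret, so that `memory.py` imports cleanly):

```python
# Hypothetical sketch of two of the checks above; test_memory_system.py is not part of
# this commit. Importing memory.py assumes the deployed environment (model cache + FlashAPI).
from memory import MemoryManager

def test_topic_extraction():
    chunk = "Topic: Chest pain triage and when to seek urgent care\nPressure-like pain warrants evaluation."
    assert MemoryManager._quick_extract_topic(chunk) == "Chest pain triage and when to seek urgent care"

def test_stm_capacity_limit():
    mem = MemoryManager(history_per_user=5)
    for i in range(8):
        mem._upsert_stm("u1", {"tag": f"topic {i}", "text": f"summary number {i} about a distinct subject"}, "EN")
    assert len(mem.get_recent_chat_history("u1", num_turns=10)) <= 5
```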
357
+ ### Expected Behaviors
358
+ - **STM:** Similar responses merge, unique details preserved
359
+ - **LTM:** Semantic search returns relevant chunks with usage tracking
360
+ - **Topics:** Detailed, medical-specific descriptions (10-20 words)
361
+ - **Summaries:** Coherent narratives combining STM + LTM + RAG
362
+ - **Performance:** Sub-second retrieval times for all operations
363
+
364
+ ## 🎯 Summary
365
+
366
+ The enhanced memory system transforms the Medical Chatbot into a sophisticated, memory-aware medical assistant that:
367
+
368
+ ✅ **Maintains Short-Term Memory (STM)** with 5 recent conversation summaries and intelligent deduplication
369
+ ✅ **Provides Long-Term Memory (LTM)** with 60 semantic chunks and FAISS-based retrieval
370
+ ✅ **Generates Enhanced Topics** using question context for detailed, medical-specific descriptions
371
+ ✅ **Creates Unified Summaries** combining STM + LTM + RAG into coherent narratives
372
+ ✅ **Implements Smart Merging** that preserves unique details while eliminating redundancy
373
+ ✅ **Ensures Conversational Continuity** across extended medical consultations
374
+ ✅ **Optimizes Performance** with sub-second retrieval and efficient memory management
375
+
376
+ This advanced memory system addresses the limitations of simple RAG systems by providing:
377
+ - **Intelligent context management** that remembers and builds upon previous interactions
378
+ - **Medical precision** with detailed topics and exact clinical information
379
+ - **Scalable architecture** that can handle extended conversations without performance degradation
380
+ - **Robust fallback strategies** ensuring system reliability in all scenarios
381
+
382
+ The result is a medical chatbot that truly understands conversation context, remembers patient history, and provides increasingly relevant and personalized medical guidance over time.
clear_mongo.py ADDED
@@ -0,0 +1,48 @@
1
+ from pymongo import MongoClient
2
+ from dotenv import load_dotenv
3
+ import os
4
+
5
+ # # Load environment variables from .env
6
+ # load_dotenv()
7
+
8
+ ##-------------##
9
+ # FOR QA CLUSTER
10
+ ##-------------##
11
+
12
+ # mongo_uri = os.getenv("MONGO_URI")
13
+ # if not mongo_uri:
14
+ # raise ValueError("❌ MongoDB URI (MongoURI) is missing!")
15
+
16
+ # client = MongoClient(mongo_uri)
17
+ # db = client["MedicalChatbotDB"] # Use the same database name as in your main script
18
+
19
+ # # To drop just the collection storing the FAISS index:
20
+ # db.drop_collection("qa_data")
21
+ # print("Dropped collection 'qa_data' from MedicalChatbotDB.")
22
+
23
+ # # Alternatively, to drop the entire database:
24
+ # client.drop_database("MedicalChatbotDB")
25
+ # print("Dropped database 'MedicalChatbotDB'.")
26
+
27
+
28
+ ##-------------##
29
+ # FOR INDEX CLUSTER
30
+ ##-------------##
31
+
32
+ # Load environment variables from .env
33
+ # load_dotenv()
34
+ # index_uri = os.getenv("INDEX_URI")
35
+ # if not index_uri:
36
+ # raise ValueError("❌ MongoDB URI (IndexURI) is missing!")
37
+
38
+ # iclient = MongoClient(index_uri)
39
+ # idb = iclient["MedicalChatbotDB"] # Use the same database name as in your main script
40
+
41
+ # # To drop just the collection storing the FAISS index:
42
+ # idb.drop_collection("faiss_index_files.files")
43
+ # idb.drop_collection("faiss_index_files.chunks")
44
+ # print("Dropped collection 'faiss_index_files' and chunks from MedicalChatbotDB.")
45
+
46
+ # # Alternatively, to drop the entire database:
47
+ # iclient.drop_database("MedicalChatbotDB")
48
+ # print("Dropped database 'MedicalChatbotDB'.")
connect_mongo.py ADDED
@@ -0,0 +1,24 @@
1
+ from pymongo import MongoClient
2
+ from dotenv import load_dotenv
3
+ import os
4
+
5
+ # Test MongoDB connection, and list out all collection.
6
+ load_dotenv()
7
+
8
+ # QA Cluster
9
+ mongo_uri = os.getenv("MONGO_URI")
10
+ client = MongoClient(mongo_uri)
11
+ db = client["MedicalChatbotDB"]
12
+ # List all collection
13
+ print("QA Collection: ",db.list_collection_names())
14
+ # Count QA documents
15
+ print("QA count: ", db.qa_data.count_documents({}))
16
+
17
+ # Index Cluster
18
+ index_uri = os.getenv("INDEX_URI")
19
+ iclient = MongoClient(index_uri)
20
+ idb = iclient["MedicalChatbotDB"]
21
+ # List all collection
22
+ print("FAISS Collection: ",idb.list_collection_names())
23
+ # Count FAISS index files
24
+ print("Index count: ", idb.faiss_index_files.files.count_documents({}))
diagnosis.py ADDED
@@ -0,0 +1,76 @@
1
+ # ✅ Google Colab: SymbiPredict Embedding + Chunking + MongoDB Upload
2
+
3
+ import pandas as pd
4
+ import numpy as np
5
+ from sentence_transformers import SentenceTransformer
6
+ from pymongo import MongoClient
7
+ from pymongo.errors import BulkWriteError
8
+ import hashlib, os
9
+ from tqdm import tqdm
10
+
11
+ # ✅ Load model
12
+ model = SentenceTransformer("all-MiniLM-L6-v2")
13
+
14
+ # ✅ Load SymbiPredict
15
+ df = pd.read_csv("symbipredict_2022.csv")
16
+
17
+ # ✅ Connect to MongoDB
18
+ mongo_uri = "..."
19
+ client = MongoClient(mongo_uri)
20
+ db = client["MedicalChatbotDB"]
21
+ collection = db["symptom_diagnosis"]
22
+
23
+ # ✅ Clear old symptom-diagnosis records
24
+ print("🧹 Dropping old 'symptom_diagnosis' collection...")
25
+ collection.drop()
26
+ # Reconfirm collection is empty
27
+ if collection.count_documents({}) != 0:
28
+ raise RuntimeError("❌ Collection not empty after drop — aborting!")
29
+
30
+ # ✅ Convert CSV rows into QA-style records with embeddings
31
+ records = []
32
+ for i, row in tqdm(df.iterrows(), total=len(df)):
33
+ symptom_cols = df.columns[:-1]
34
+ label_col = df.columns[-1]
35
+
36
+ # Extract symptoms present (value==1)
37
+ symptoms = [col.replace("_", " ").strip() for col in symptom_cols if row[col] == 1]
38
+ if not symptoms:
39
+ continue
40
+
41
+ label = row[label_col].strip()
42
+ question = f"What disease is likely given these symptoms: {', '.join(symptoms)}?"
43
+ answer = f"The patient is likely suffering from: {label}."
44
+
45
+ # Embed question only
46
+ embed = model.encode(question, convert_to_numpy=True)
47
+ hashkey = hashlib.md5((question + answer).encode()).hexdigest()
48
+
49
+ records.append({
50
+ "_id": hashkey,
51
+ "i": int(i),
52
+ "symptoms": symptoms,
53
+ "prognosis": label,
54
+ "question": question,
55
+ "answer": answer,
56
+ "embedding": embed.tolist()
57
+ })
58
+
59
+ # ✅ Save to MongoDB
60
+ if records:
61
+ print(f"⬆️ Uploading {len(records)} records to MongoDB...")
62
+ unique_ids = set()
63
+ deduped = []
64
+ for r in records:
65
+ if r["_id"] not in unique_ids:
66
+ unique_ids.add(r["_id"])
67
+ deduped.append(r)
68
+ try:
69
+ collection.insert_many(deduped, ordered=False)
70
+ print(f"✅ Inserted {len(deduped)} records without duplicates.")
71
+ except BulkWriteError as bwe:
72
+ inserted = bwe.details.get('nInserted', 0)
73
+ print(f"⚠️ Inserted with some duplicate skips. Records inserted: {inserted}")
74
+ print("✅ Upload complete.")
75
+ else:
76
+ print("⚠️ No records to upload.")
download_model.py ADDED
@@ -0,0 +1,51 @@
1
+ # download_model.py
2
+ ### --- A. transformer and embedder ---
3
+ import os
4
+ import shutil
5
+ from huggingface_hub import snapshot_download
6
+
7
+ # Set up paths
8
+ MODEL_REPO = "sentence-transformers/all-MiniLM-L6-v2"
9
+ MODEL_CACHE_DIR = "/app/model_cache"
10
+
11
+ print("⏳ Downloading the SentenceTransformer model...")
12
+ model_path = snapshot_download(repo_id=MODEL_REPO, cache_dir=MODEL_CACHE_DIR)
13
+
14
+ print("Model path: ", model_path)
15
+
16
+ # Ensure the directory exists
17
+ if not os.path.exists(MODEL_CACHE_DIR):
18
+ os.makedirs(MODEL_CACHE_DIR)
19
+
20
+ # Move all contents from the snapshot folder
21
+ if os.path.exists(model_path):
22
+ print(f"📂 Moving model files from {model_path} to {MODEL_CACHE_DIR}...")
23
+
24
+ for item in os.listdir(model_path):
25
+ source = os.path.join(model_path, item)
26
+ destination = os.path.join(MODEL_CACHE_DIR, item)
27
+
28
+ if os.path.isdir(source):
29
+ shutil.copytree(source, destination, dirs_exist_ok=True)
30
+ else:
31
+ shutil.copy2(source, destination)
32
+
33
+ print(f"✅ Model extracted and flattened in {MODEL_CACHE_DIR}")
34
+ else:
35
+ print("❌ No snapshot directory found!")
36
+ exit(1)
37
+
38
+ # Verify structure after moving
39
+ print("\n📂 LLM Model Structure (Build Level):")
40
+ for root, dirs, files in os.walk(MODEL_CACHE_DIR):
41
+ print(f"📁 {root}/")
42
+ for file in files:
43
+ print(f" 📄 {file}")
44
+
45
+
46
+ ### --- B. translation modules ---
47
+ from transformers import pipeline
48
+ print("⏬ Downloading Vietnamese–English translator...")
49
+ _ = pipeline("translation", model="VietAI/envit5-translation", src_lang="vi", tgt_lang="en")
50
+ print("⏬ Downloading Chinese–English translator...")
51
+ _ = pipeline("translation", model="Helsinki-NLP/opus-mt-zh-en")
memory.py ADDED
@@ -0,0 +1,426 @@
1
+ # memory.py
2
+ import re, time, hashlib, asyncio, os
3
+ from collections import defaultdict, deque
4
+ from typing import List, Dict
5
+ import numpy as np
6
+ import faiss
7
+ from sentence_transformers import SentenceTransformer
8
+ from google import genai # must be configured in app.py and imported globally
9
+ import logging
10
+
11
+ _LLM_SMALL = "gemini-2.5-flash-lite-preview-06-17"
12
+ # Load embedding model
13
+ EMBED = SentenceTransformer("/app/model_cache", device="cpu").half()
14
+ logger = logging.getLogger("rag-agent")
15
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s — %(name)s — %(levelname)s — %(message)s", force=True) # Change INFO to DEBUG for full-ctx JSON loader
16
+
17
+ api_key = os.getenv("FlashAPI")
18
+ client = genai.Client(api_key=api_key)
19
+
20
+ class MemoryManager:
21
+ def __init__(self, max_users=1000, history_per_user=20, max_chunks=60):
22
+ # STM: recent conversation summaries (topic + summary), capped at history_per_user entries
23
+ self.stm_summaries = defaultdict(lambda: deque(maxlen=history_per_user)) # deque of {topic,text,vec,timestamp,used}
24
+ # Legacy raw cache (kept for compatibility if needed)
25
+ self.text_cache = defaultdict(lambda: deque(maxlen=history_per_user))
26
+ # LTM: semantic chunk store (approx 3 chunks x 20 rounds)
27
+ self.chunk_index = defaultdict(self._new_index) # user_id -> faiss index
28
+ self.chunk_meta = defaultdict(list) # user_id -> list[{text,tag,vec,timestamp,used}]
29
+ self.user_queue = deque(maxlen=max_users) # LRU of users
30
+ self.max_chunks = max_chunks # hard cap per user
31
+ self.chunk_cache = {} # hash(query+resp) -> [chunks]
32
+
33
+ # ---------- Public API ----------
34
+ def add_exchange(self, user_id: str, query: str, response: str, lang: str = "EN"):
35
+ self._touch_user(user_id)
36
+ # Keep raw record (optional)
37
+ self.text_cache[user_id].append(((query or "").strip(), (response or "").strip()))
38
+ if not response: return []
39
+ # Avoid re-chunking identical response
40
+ cache_key = hashlib.md5((query + response).encode()).hexdigest()
41
+ if cache_key in self.chunk_cache:
42
+ chunks = self.chunk_cache[cache_key]
43
+ else:
44
+ chunks = self.chunk_response(response, lang, question=query)
45
+ self.chunk_cache[cache_key] = chunks
46
+ # Update STM with merging/deduplication
47
+ for chunk in chunks:
48
+ self._upsert_stm(user_id, chunk, lang)
49
+ # Update LTM with merging/deduplication
50
+ self._upsert_ltm(user_id, chunks, lang)
51
+ return chunks
52
+
53
+ def get_relevant_chunks(self, user_id: str, query: str, top_k: int = 3, min_sim: float = 0.30) -> List[str]:
54
+ """Return texts of chunks whose cosine similarity ≥ min_sim."""
55
+ if self.chunk_index[user_id].ntotal == 0:
56
+ return []
57
+ # Encode the query
58
+ qvec = self._embed(query)
59
+ sims, idxs = self.chunk_index[user_id].search(np.array([qvec]), k=top_k)
60
+ results = []
61
+ # Score each hit with recency decay and a usage boost, prioritizing recent chunks
62
+ for sim, idx in zip(sims[0], idxs[0]):
63
+ if idx < len(self.chunk_meta[user_id]) and sim >= min_sim:
64
+ chunk = self.chunk_meta[user_id][idx]
65
+ chunk["used"] += 1 # increment usage
66
+ # Decay function
67
+ age_sec = time.time() - chunk["timestamp"]
68
+ decay = 1.0 / (1.0 + age_sec / 300) # 5-min half-life
69
+ score = sim * decay * (1 + 0.1 * chunk["used"])
70
+ # Append chunk with score
71
+ results.append((score, chunk))
72
+ # Sort result on best scored
73
+ results.sort(key=lambda x: x[0], reverse=True)
74
+ # logger.info(f"[Memory] RAG Retrieved Topic: {results}") # Inspect vector data
75
+ return [f"### Topic: {c['tag']}\n{c['text']}" for _, c in results]
76
+
77
+ def get_recent_chat_history(self, user_id: str, num_turns: int = 5) -> List[Dict]:
78
+ """
79
+ Get the most recent short-term memory summaries.
80
+ Returns: a list of entries containing only the summarized bot context.
81
+ """
82
+ if user_id not in self.stm_summaries:
83
+ return []
84
+ recent = list(self.stm_summaries[user_id])[-num_turns:]
85
+ formatted = []
86
+ for entry in recent:
87
+ formatted.append({
88
+ "user": "",
89
+ "bot": f"Topic: {entry['topic']}\n{entry['text']}",
90
+ "timestamp": entry.get("timestamp", time.time())
91
+ })
92
+ return formatted
93
+
94
+ def get_context(self, user_id: str, num_turns: int = 5) -> str:
95
+ # Prefer STM summaries
96
+ history = self.get_recent_chat_history(user_id, num_turns=num_turns)
97
+ return "\n".join(h["bot"] for h in history)
98
+
99
+ def get_contextual_chunks(self, user_id: str, current_query: str, lang: str = "EN") -> str:
100
+ """
101
+ Use Gemini Flash Lite to create a summarization of relevant context from both recent history and RAG chunks.
102
+ This ensures conversational continuity while providing a concise summary for the main LLM.
103
+ """
104
+ # Get both types of context
105
+ recent_history = self.get_recent_chat_history(user_id, num_turns=5)
106
+ rag_chunks = self.get_relevant_chunks(user_id, current_query, top_k=3)
107
+
108
+ logger.info(f"[Contextual] Retrieved {len(recent_history)} recent history items")
109
+ logger.info(f"[Contextual] Retrieved {len(rag_chunks)} RAG chunks")
110
+
111
+ # Return empty string if no context is found
112
+ if not recent_history and not rag_chunks:
113
+ logger.info(f"[Contextual] No context found, returning empty string")
114
+ return ""
115
+ # Prepare context for Gemini to summarize
116
+ context_parts = []
117
+ # Add recent chat history
118
+ if recent_history:
119
+ history_text = "\n".join([
120
+ f"User: {item['user']}\nBot: {item['bot']}"
121
+ for item in recent_history
122
+ ])
123
+ context_parts.append(f"Recent conversation history:\n{history_text}")
124
+ # Add RAG chunks
125
+ if rag_chunks:
126
+ rag_text = "\n".join(rag_chunks)
127
+ context_parts.append(f"Semantically relevant historical medical information:\n{rag_text}")
128
+
129
+ # Build summarization prompt
130
+ summarization_prompt = f"""
131
+ You are a medical assistant creating a concise summary of conversation context for continuity.
132
+
133
+ Current user query: "{current_query}"
134
+
135
+ Available context information:
136
+ {chr(10).join(context_parts)}
137
+
138
+ Task: Create a brief, coherent summary that captures the key points from the conversation history and relevant medical information that are important for understanding the current query.
139
+
140
+ Guidelines:
141
+ 1. Focus on medical symptoms, diagnoses, treatments, or recommendations mentioned
142
+ 2. Include any patient concerns or questions that are still relevant
143
+ 3. Highlight any follow-up needs or pending clarifications
144
+ 4. Keep the summary concise but comprehensive enough for context
145
+ 5. Maintain conversational flow and continuity
146
+
147
+ Output: Provide a single, well-structured summary paragraph that can be used as context for the main LLM to provide a coherent response.
148
+ If no relevant context exists, return "No relevant context found."
149
+
150
+ Language context: {lang}
151
+ """
152
+
153
+ logger.debug(f"[Contextual] Full prompt: {summarization_prompt}")
154
+ # Ask Gemini Flash Lite for the unified context summary
155
+ try:
156
+ # Use Gemini Flash Lite for summarization
157
+ client = genai.Client(api_key=os.getenv("FlashAPI"))
158
+ result = client.models.generate_content(
159
+ model=_LLM_SMALL,
160
+ contents=summarization_prompt
161
+ )
162
+ summary = result.text.strip()
163
+ if "No relevant context found" in summary:
164
+ logger.info(f"[Contextual] Gemini indicated no relevant context found")
165
+ return ""
166
+
167
+ logger.info(f"[Contextual] Gemini created summary: {summary[:100]}...")
168
+ return summary
169
+
170
+ except Exception as e:
171
+ logger.warning(f"[Contextual] Gemini summarization failed: {e}")
172
+ logger.info(f"[Contextual] Using fallback summarization method")
173
+ # Fallback: create a simple summary
174
+ fallback_summary = []
175
+ # Fallback: add recent history
176
+ if recent_history:
177
+ recent_summary = f"Recent conversation: User asked about {recent_history[-1]['user'][:50]}... and received a response about {recent_history[-1]['bot'][:50]}..."
178
+ fallback_summary.append(recent_summary)
179
+ logger.info(f"[Contextual] Fallback: Added recent history summary")
180
+ # Fallback: add RAG chunks
181
+ if rag_chunks:
182
+ rag_summary = f"Relevant medical information: {len(rag_chunks)} chunks found covering various medical topics."
183
+ fallback_summary.append(rag_summary)
184
+ logger.info(f"[Contextual] Fallback: Added RAG chunks summary")
185
+ final_fallback = " ".join(fallback_summary) if fallback_summary else ""
186
+ return final_fallback
187
+
188
+ def reset(self, user_id: str):
189
+ self._drop_user(user_id)
190
+
191
+ # ---------- Internal helpers ----------
192
+ def _touch_user(self, user_id: str):
193
+ if user_id not in self.text_cache and len(self.user_queue) >= self.user_queue.maxlen:
194
+ self._drop_user(self.user_queue.popleft())
195
+ if user_id in self.user_queue:
196
+ self.user_queue.remove(user_id)
197
+ self.user_queue.append(user_id)
198
+
199
+ def _drop_user(self, user_id: str):
200
+ self.text_cache.pop(user_id, None)
201
+ self.chunk_index.pop(user_id, None)
202
+ self.chunk_meta.pop(user_id, None)
203
+ if user_id in self.user_queue:
204
+ self.user_queue.remove(user_id)
205
+
206
+ def _rebuild_index(self, user_id: str, keep_last: int):
207
+ """Trim chunk list + rebuild FAISS index for user."""
208
+ self.chunk_meta[user_id] = self.chunk_meta[user_id][-keep_last:]
209
+ index = self._new_index()
210
+ # Store each chunk's vector once and reuse it.
211
+ for chunk in self.chunk_meta[user_id]:
212
+ index.add(np.array([chunk["vec"]]))
213
+ self.chunk_index[user_id] = index
214
+
215
+ @staticmethod
216
+ def _new_index():
217
+ # Use cosine similarity (vectors must be L2-normalised)
218
+ return faiss.IndexFlatIP(384)
219
+
220
+ @staticmethod
221
+ def _embed(text: str):
222
+ vec = EMBED.encode(text, convert_to_numpy=True)
223
+ # L2 normalise for cosine on IndexFlatIP
224
+ return vec / (np.linalg.norm(vec) + 1e-9)
225
+
226
+ def chunk_response(self, response: str, lang: str, question: str = "") -> List[Dict]:
227
+ """
228
+ Calls Gemini to:
229
+ - Translate (if needed)
230
+ - Chunk by context/topic (exclude disclaimer section)
231
+ - Summarise
232
+ Returns: [{"tag": ..., "text": ...}, ...]
233
+ """
234
+ if not response: return []
235
+ # Gemini instruction
236
+ instructions = []
237
+ # if lang.upper() != "EN":
238
+ # instructions.append("- Translate the response to English.")
239
+ instructions.append("- Break the translated (or original) text into semantically distinct parts, grouped by medical topic, symptom, assessment, plan, or instruction (exclude disclaimer section).")
240
+ instructions.append("- For each part, generate a clear, concise summary. The summary may vary in length depending on the complexity of the topic — do not omit key clinical instructions and exact medication names/doses if present.")
241
+ instructions.append("- At the start of each part, write `Topic: <concise but specific sentence (10-20 words) capturing patient context, condition, and action>`.")
242
+ instructions.append("- Separate each part using three dashes `---` on a new line.")
243
+ # if lang.upper() != "EN":
244
+ # instructions.append(f"Below is the user-provided medical response written in `{lang}`")
245
+ # Gemini prompt
246
+ prompt = f"""
247
+ You are a medical assistant helping organize and condense a clinical response.
248
+ If helpful, use the user's latest question for context to craft specific topics.
249
+ User's latest question (context): {question}
250
+ ------------------------
251
+ {response}
252
+ ------------------------
253
+ Please perform the following tasks:
254
+ {chr(10).join(instructions)}
255
+
256
+ Output only the structured summaries, separated by dashes.
257
+ """
258
+ retries = 0
259
+ while retries < 5:
260
+ try:
261
+ client = genai.Client(api_key=os.getenv("FlashAPI"))
262
+ result = client.models.generate_content(
263
+ model=_LLM_SMALL,
264
+ contents=prompt
265
+ # ,generation_config={"temperature": 0.4} # Skip temp configs for gem-flash
266
+ )
267
+ output = result.text.strip()
268
+ logger.info(f"[Memory] 📦 Gemini summarized chunk output: {output}")
269
+ return [
270
+ {"tag": self._quick_extract_topic(chunk), "text": chunk.strip()}
271
+ for chunk in output.split('---') if chunk.strip()
272
+ ]
273
+ except Exception as e:
274
+ logger.warning(f"[Memory] ❌ Gemini chunking failed: {e}")
275
+ retries += 1
276
+ time.sleep(0.5)
277
+ return [{"tag": "general", "text": response.strip()}] # fallback
278
+
279
+ @staticmethod
280
+ def _quick_extract_topic(chunk: str) -> str:
281
+ """Heuristically extract the topic from a chunk (title line or first 3 words)."""
282
+ # Expecting 'Topic: <something>'
283
+ match = re.search(r'^Topic:\s*(.+)', chunk, re.IGNORECASE | re.MULTILINE)
284
+ if match:
285
+ return match.group(1).strip()
286
+ lines = chunk.strip().splitlines()
287
+ for line in lines:
288
+ if len(line.split()) <= 8 and line.strip().endswith(":"):
289
+ return line.strip().rstrip(":")
290
+ return " ".join(chunk.split()[:3]).rstrip(":.,")
291
+
292
+ # ---------- New merging/dedup logic ----------
293
+ def _upsert_stm(self, user_id: str, chunk: Dict, lang: str):
294
+ """Insert or merge a summarized chunk into STM with semantic dedup/merge.
295
+ Identical: replace the older with new. Partially similar: merge extra details from older into newer.
296
+ """
297
+ topic = self._enrich_topic(chunk.get("tag", ""), chunk.get("text", ""))
298
+ text = chunk.get("text", "").strip()
299
+ vec = self._embed(text)
300
+ now = time.time()
301
+ entry = {"topic": topic, "text": text, "vec": vec, "timestamp": now, "used": 0}
302
+ stm = self.stm_summaries[user_id]
303
+ if not stm:
304
+ stm.append(entry)
305
+ return
306
+ # find best match
307
+ best_idx = -1
308
+ best_sim = -1.0
309
+ for i, e in enumerate(stm):
310
+ sim = float(np.dot(vec, e["vec"]))
311
+ if sim > best_sim:
312
+ best_sim = sim
313
+ best_idx = i
314
+ if best_sim >= 0.92: # nearly identical
315
+ # replace older with current
316
+ stm.rotate(-best_idx)
317
+ stm.popleft()
318
+ stm.rotate(best_idx)
319
+ stm.append(entry)
320
+ elif best_sim >= 0.75: # partially similar → merge
321
+ base = stm[best_idx]
322
+ merged_text = self._merge_texts(new_text=text, old_text=base["text"]) # add bits from old not in new
323
+ merged_topic = base["topic"] if len(base["topic"]) > len(topic) else topic
324
+ merged_vec = self._embed(merged_text)
325
+ merged_entry = {"topic": merged_topic, "text": merged_text, "vec": merged_vec, "timestamp": now, "used": base.get("used", 0)}
326
+ stm.rotate(-best_idx)
327
+ stm.popleft()
328
+ stm.rotate(best_idx)
329
+ stm.append(merged_entry)
330
+ else:
331
+ stm.append(entry)
332
+
333
+ def _upsert_ltm(self, user_id: str, chunks: List[Dict], lang: str):
334
+ """Insert or merge chunks into LTM with semantic dedup/merge, then rebuild index.
335
+ Keeps only the most recent self.max_chunks entries.
336
+ """
337
+ current_list = self.chunk_meta[user_id]
338
+ for chunk in chunks:
339
+ text = chunk.get("text", "").strip()
340
+ if not text:
341
+ continue
342
+ vec = self._embed(text)
343
+ topic = self._enrich_topic(chunk.get("tag", ""), text)
344
+ now = time.time()
345
+ new_entry = {"tag": topic, "text": text, "vec": vec, "timestamp": now, "used": 0}
346
+ if not current_list:
347
+ current_list.append(new_entry)
348
+ continue
349
+ # find best similar entry
350
+ best_idx = -1
351
+ best_sim = -1.0
352
+ for i, e in enumerate(current_list):
353
+ sim = float(np.dot(vec, e["vec"]))
354
+ if sim > best_sim:
355
+ best_sim = sim
356
+ best_idx = i
357
+ if best_sim >= 0.92:
358
+ # replace older with new
359
+ current_list[best_idx] = new_entry
360
+ elif best_sim >= 0.75:
361
+ # merge details
362
+ base = current_list[best_idx]
363
+ merged_text = self._merge_texts(new_text=text, old_text=base["text"]) # add unique sentences from old
364
+ merged_topic = base["tag"] if len(base["tag"]) > len(topic) else topic
365
+ merged_vec = self._embed(merged_text)
366
+ current_list[best_idx] = {"tag": merged_topic, "text": merged_text, "vec": merged_vec, "timestamp": now, "used": base.get("used", 0)}
367
+ else:
368
+ current_list.append(new_entry)
369
+ # Trim and rebuild index
370
+ if len(current_list) > self.max_chunks:
371
+ current_list[:] = current_list[-self.max_chunks:]
372
+ self._rebuild_index(user_id, keep_last=self.max_chunks)
373
+
374
+ @staticmethod
375
+ def _split_sentences(text: str) -> List[str]:
376
+ # naive sentence splitter by ., !, ?
377
+ parts = re.split(r"(?<=[\.!?])\s+", text.strip())
378
+ return [p.strip() for p in parts if p.strip()]
379
+
380
+ def _merge_texts(self, new_text: str, old_text: str) -> str:
381
+ """Append sentences from old_text that are not already contained in new_text (by fuzzy match)."""
382
+ new_sents = self._split_sentences(new_text)
383
+ old_sents = self._split_sentences(old_text)
384
+ new_set = set(s.lower() for s in new_sents)
385
+ merged = list(new_sents)
386
+ for s in old_sents:
387
+ s_norm = s.lower()
388
+ # consider present if significant overlap with any existing sentence
389
+ if s_norm in new_set:
390
+ continue
391
+ # skip near-duplicates: high token overlap with a sentence already kept
392
+ if any(self._overlap_ratio(s_norm, t.lower()) > 0.8 for t in merged):
393
+ continue
394
+ merged.append(s)
395
+ return " ".join(merged)
396
+
397
+ @staticmethod
398
+ def _overlap_ratio(a: str, b: str) -> float:
399
+ """Compute token overlap ratio between two sentences."""
400
+ ta = set(re.findall(r"\w+", a))
401
+ tb = set(re.findall(r"\w+", b))
402
+ if not ta or not tb:
403
+ return 0.0
404
+ inter = len(ta & tb)
405
+ union = len(ta | tb)
406
+ return inter / union
407
+
408
+ @staticmethod
409
+ def _enrich_topic(topic: str, text: str) -> str:
410
+ """Make topic more descriptive if it's too short by using the first sentence of the text.
411
+ Does not call LLM to keep latency low.
412
+ """
413
+ topic = (topic or "").strip()
414
+ if len(topic.split()) < 5 or len(topic) < 20:
415
+ sents = re.split(r"(?<=[\.!?])\s+", text.strip())
416
+ if sents:
417
+ first = sents[0]
418
+ # cap to ~16 words
419
+ words = first.split()
420
+ if len(words) > 16:
421
+ first = " ".join(words[:16])
422
+ # use the trimmed first sentence as the topic
423
+ return first.strip().rstrip(':')
424
+ return topic
425
+
426
+
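For quick orientation, the STM/LTM upsert logic above reduces to one decision rule: cosine similarity ≥ 0.92 replaces the older entry, ≥ 0.75 merges the older entry's unique sentences into the newer text, and anything below appends a new chunk. Below is a minimal, self-contained sketch of that rule, not the module itself; the random `embed` stub is hypothetical and only stands in for the cached 384-d SentenceTransformer encoder, and the merge step is simplified to compare old sentences against the whole new text.

# Hypothetical sketch of the upsert decision rule used in memory.py (not part of the repo).
import re
import numpy as np

def embed(text: str) -> np.ndarray:
    # deterministic stub vector; the real code uses the L2-normalised SentenceTransformer output
    rng = np.random.default_rng(abs(hash(text)) % (2**32))
    v = rng.random(384).astype("float32")
    return v / (np.linalg.norm(v) + 1e-9)

def overlap_ratio(a: str, b: str) -> float:
    ta, tb = set(re.findall(r"\w+", a.lower())), set(re.findall(r"\w+", b.lower()))
    return len(ta & tb) / len(ta | tb) if ta and tb else 0.0

def upsert(entries: list, text: str) -> None:
    vec = embed(text)
    if not entries:
        entries.append({"text": text, "vec": vec})
        return
    sims = [float(np.dot(vec, e["vec"])) for e in entries]
    best = int(np.argmax(sims))
    if sims[best] >= 0.92:      # nearly identical: keep only the newer text
        entries[best] = {"text": text, "vec": vec}
    elif sims[best] >= 0.75:    # partially similar: pull unique old sentences into the new text
        old_sents = re.split(r"(?<=[\.!?])\s+", entries[best]["text"])
        kept = [s for s in old_sents if s and overlap_ratio(s, text) <= 0.8]
        merged = " ".join([text] + kept)
        entries[best] = {"text": merged, "vec": embed(merged)}
    else:                       # unrelated: append as a new memory chunk
        entries.append({"text": text, "vec": vec})

The 0.92/0.75 thresholds and the 0.8 sentence-overlap cutoff mirror the constants used in `_upsert_stm`, `_upsert_ltm`, and `_merge_texts` above.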
migrate.py ADDED
@@ -0,0 +1,48 @@
1
+ # Running this script to split FAISS index collection to the second/different cluster.
2
+ from pymongo import MongoClient
3
+ from dotenv import load_dotenv
4
+ import os
5
+
6
+ # Load environment variables from .env
7
+ load_dotenv()
8
+ # Connection strings (update as needed)
9
+ mongo_uri = os.getenv("MONGO_URI") # QA cluster connection string
10
+ index_uri = os.getenv("INDEX_URI") # FAISS index cluster connection string
11
+
12
+ if not mongo_uri:
13
+ raise ValueError("MONGO_URI is missing!")
14
+ if not index_uri:
15
+ raise ValueError("INDEX_URI is missing!")
16
+
17
+ # Connect to the QA cluster (where FAISS data was accidentally stored)
18
+ qa_client = MongoClient(mongo_uri)
19
+ qa_db = qa_client["MedicalChatbotDB"]
20
+
21
+ # Connect to the FAISS index cluster
22
+ faiss_client = MongoClient(index_uri)
23
+ faiss_db = faiss_client["MedicalChatbotDB"] # Use the same database name if desired
24
+
25
+ # Define the GridFS collections to move.
26
+ # In GridFS, files are stored in two collections: "<bucket>.files" and "<bucket>.chunks".
27
+ source_files = qa_db["faiss_index_files.files"]
28
+ source_chunks = qa_db["faiss_index_files.chunks"]
29
+
30
+ dest_files = faiss_db["faiss_index_files.files"]
31
+ dest_chunks = faiss_db["faiss_index_files.chunks"]
32
+
33
+ print("Moving FAISS index GridFS files...")
34
+
35
+ # Copy documents from the source 'files' collection
36
+ for doc in source_files.find():
37
+ dest_files.insert_one(doc)
38
+
39
+ # Copy documents from the source 'chunks' collection
40
+ for doc in source_chunks.find():
41
+ dest_chunks.insert_one(doc)
42
+
43
+ print("✅ FAISS GridFS collections moved successfully.")
44
+
45
+ # Drop the old collections from the QA cluster to free up space (comment these out to keep them):
46
+ qa_db.drop_collection("faiss_index_files.files")
47
+ qa_db.drop_collection("faiss_index_files.chunks")
48
+ print("Old FAISS GridFS collections dropped from the QA cluster.")
requirements.txt ADDED
@@ -0,0 +1,23 @@
1
+ # requirements.txt
2
+ # **LLMs**
3
+ google-genai
4
+ huggingface_hub
5
+ # **RAG**
6
+ faiss-cpu
7
+ sentence-transformers
8
+ # **NLPs**
9
+ transformers
10
+ accelerate
11
+ sentencepiece
12
+ # **Environment**
13
+ python-dotenv # Not used in Streamlit deployment
14
+ pymongo
15
+ # **VLMs**
16
+ # transformers
17
+ gradio_client
18
+ pillow
19
+ # **Deployment**
20
+ uvicorn
21
+ fastapi
22
+ torch # Needed for half-precision (float16) model loading to cut RAM usage
23
+ psutil # CPU/RAM logger
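After `pip install -r requirements.txt` inside the image, a quick smoke test can catch a broken pin before the app boots. The snippet below is a hypothetical check, not part of the repo; the module names are inferred from the packages pinned above.

# Hypothetical smoke test: confirm the heavy runtime dependencies resolve inside the image.
import importlib

for mod in ("faiss", "sentence_transformers", "torch", "fastapi", "pymongo", "google.genai", "gradio_client"):
    importlib.import_module(mod)
    print(f"import ok: {mod}")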
translation.py ADDED
@@ -0,0 +1,26 @@
1
+ # translation.py
2
+ from transformers import pipeline
3
+ import logging
4
+
5
+ logger = logging.getLogger("translation-agent")
6
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s — %(name)s — %(levelname)s — %(message)s", force=True) # Change INFO to DEBUG for full-ctx JSON loader
7
+
8
+ # Lazily loaded translation pipelines (initialised on first use)
9
+ vi_en = None
10
+ zh_en = None
11
+
12
+ def translate_query(text: str, lang_code: str) -> str:
13
+ global vi_en, zh_en
14
+ if lang_code == "vi":
15
+ if vi_en is None:
16
+ vi_en = pipeline("translation", model="VietAI/envit5-translation", src_lang="vi", tgt_lang="en", device=-1)
17
+ result = vi_en(text, max_length=512)[0]["translation_text"]
18
+ logger.info(f"[En-Vi] Query in `{lang_code}` translated to: {result}")
19
+ return result
20
+ elif lang_code == "zh":
21
+ if zh_en is None:
22
+ zh_en = pipeline("translation", model="Helsinki-NLP/opus-mt-zh-en", device=-1)
23
+ result = zh_en(text, max_length=512)[0]["translation_text"]
24
+ logger.info(f"[En-Zh] Query in `{lang_code}` translated to: {result}")
25
+ return result
26
+ return text
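A hypothetical usage sketch follows; the Vietnamese sample sentence is illustrative only, and the first call downloads the VietAI model, so expect a one-off delay on cold start.

from translation import translate_query

# Vietnamese query -> English for the downstream RAG/LLM pipeline
print(translate_query("Tôi bị đau đầu và sốt nhẹ", "vi"))

# Unsupported language codes fall through unchanged
assert translate_query("hello", "en") == "hello"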
vlm.py ADDED
@@ -0,0 +1,54 @@
1
+ import os, logging, traceback, json, base64
2
+ from io import BytesIO
3
+ from PIL import Image
4
+ from translation import translate_query
5
+ from gradio_client import Client, handle_file
6
+ import tempfile
7
+
8
+ logger = logging.getLogger("vlm-agent")
9
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s — %(name)s — %(levelname)s — %(message)s", force=True)
10
+
11
+ # ✅ Load Gradio client once
12
+ gr_client = None
13
+ def load_gradio_client():
14
+ global gr_client
15
+ if gr_client is None:
16
+ logger.info("[VLM] ⏳ Connecting to MedGEMMA Gradio Space...")
17
+ gr_client = Client("warshanks/medgemma-4b-it")
18
+ logger.info("[VLM] Gradio MedGEMMA client ready.")
19
+ return gr_client
20
+
21
+ def process_medical_image(base64_image: str, prompt: str = None, lang: str = "EN") -> str:
22
+ if not prompt:
23
+ prompt = "Describe and investigate any clinical findings from this medical image."
24
+ elif lang.upper() in {"VI", "ZH"}:
25
+ prompt = translate_query(prompt, lang.lower())
26
+
27
+ try:
28
+ # 1️⃣ Decode base64 image to temp file
29
+ image_data = base64.b64decode(base64_image)
30
+ with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
31
+ tmp.write(image_data)
32
+ tmp.flush()
33
+ image_path = tmp.name
34
+
35
+ # 2️⃣ Send to Gradio MedGEMMA
36
+ client = load_gradio_client()
37
+ logger.info(f"[VLM] Sending prompt: {prompt}")
38
+ result = client.predict(
39
+ message={"text": prompt, "files": [handle_file(image_path)]},
40
+ param_2="You analyze medical images and report abnormalities and diseases with clear diagnostic insight.",
41
+ param_3=2048,
42
+ api_name="/chat"
43
+ )
44
+ if isinstance(result, str):
45
+ logger.info(f"[VLM] ✅ Response: {result}")
46
+ return result.strip()
47
+ else:
48
+ logger.warning(f"[VLM] ⚠️ Unexpected result type: {type(result)} — {result}")
49
+ return str(result)
50
+
51
+ except Exception as e:
52
+ logger.error(f"[VLM] ❌ Exception: {e}")
53
+ logger.error(f"[VLM] 🔍 Traceback:\n{traceback.format_exc()}")
54
+ return f"[VLM] ⚠️ Failed to process image: {e}"
warmup.py ADDED
@@ -0,0 +1,8 @@
1
+ from sentence_transformers import SentenceTransformer
2
+ import torch
3
+
4
+ print("🚀 Warming up model...")
5
+ embedding_model = SentenceTransformer("/app/model_cache", device="cpu")
6
+ embedding_model = embedding_model.half() # Reduce memory
7
+ embedding_model.to(torch.device("cpu"))
8
+ print("✅ Model warm-up complete!")