aursalan committed on
Commit
33afddb
Β·
1 Parent(s): 20a1c36

Added files

Browse files
Files changed (3) hide show
  1. Dockerfile +15 -0
  2. main.py +228 -0
  3. requirements.txt +7 -0
Dockerfile ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Base image: official Python 3.9 (Debian-based).
FROM python:3.9

WORKDIR /code

# Copy and install requirements first so this layer is cached
# when only application code changes.
COPY ./requirements.txt /code/requirements.txt
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# Copy the application source into the image.
COPY . /code

# Create a non-root user (security best practice for HF Spaces)
RUN useradd -m -u 1000 user
USER user

# Run the application on port 7860
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
main.py ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import datetime
import html
import logging
import os
import threading
from collections import deque
from contextlib import asynccontextmanager

import pandas as pd
import psycopg2
from fastapi import FastAPI, BackgroundTasks, HTTPException
from fastapi.responses import HTMLResponse
from psycopg2.extras import execute_values
from sentence_transformers import SentenceTransformer
13
+
14
# --- Configuration ---
# Postgres DSN for the Supabase instance; must be supplied via environment.
SUPABASE_CONNECTION_STRING = os.getenv("SUPABASE_CONNECTION_STRING")

# --- Toggles & Tuning ---
PROCESSING_CHUNK_SIZE = 10   # candidates locked and processed per trigger
EMBEDDING_BATCH_SIZE = 32    # texts per model.encode() batch
DRY_RUN = False              # when True, no DB writes are performed

# --- Global State ---
model = None  # SentenceTransformer instance, populated by lifespan() at startup
execution_logs = deque(maxlen=50) # Stores the last 50 batch logs in RAM
is_processing = False # Lock to prevent overlapping pings
# NOTE(review): `is_processing` appears unused in this file — the actual
# overlap guard is `processing_lock` below; confirm before removing.
processing_lock = threading.Lock()
27
+
28
+ # --- Lifespan Manager (Loads Model on Startup) ---
29
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan handler: load the embedding model once at startup.

    The model is stored in the module-level `model` global so endpoint and
    worker code can reuse the single loaded instance.
    """
    global model
    print("⏳ Loading Model...")
    # Load model once when the API starts
    model = SentenceTransformer('Alibaba-NLP/gte-modernbert-base', trust_remote_code=True)
    print("βœ… Model Loaded.")
    yield
    # Code after `yield` runs at application shutdown.
    print("πŸ›‘ Shutting down...")
38
+
39
# Application instance wired to the lifespan handler above so the model
# is loaded exactly once per process.
app = FastAPI(lifespan=lifespan)
40
+
41
+ # --- Helper Functions ---
42
+
43
def fetch_and_lock_chunk(conn, chunk_size):
    """Select and row-lock up to `chunk_size` candidates that need (re)embedding.

    A candidate qualifies when it has no embedding yet or its profile was
    updated after the embedding was last written. `FOR UPDATE SKIP LOCKED`
    lets multiple workers run concurrently without grabbing the same rows;
    the row locks are held until the caller commits or rolls back `conn`.

    Returns a pandas DataFrame with one row per candidate, including
    json-aggregated profile fields (roles, skills, projects, education,
    certifications, achievements) gathered via correlated subqueries.
    """
    query = """
    WITH locked_candidates AS (
        SELECT candidate_id, candidate_name
        FROM candidate_profiles
        WHERE
            candidate_embeddings IS NULL
            OR updated_at > candidate_embeddings_updated_at
        LIMIT %s
        FOR UPDATE SKIP LOCKED
    )
    SELECT
        lc.candidate_id,
        lc.candidate_name,
        (SELECT json_agg(DISTINCT role) FROM candidate_experiences WHERE candidate_id = lc.candidate_id AND role IS NOT NULL) AS experience_roles,
        (SELECT json_agg(DISTINCT experience_description) FROM candidate_experiences WHERE candidate_id = lc.candidate_id AND experience_description IS NOT NULL) AS experience_descriptions,
        (SELECT json_agg(DISTINCT s.skill_name) FROM candidate_skill_map csm JOIN skills s ON csm.skill_id = s.skill_id WHERE csm.candidate_id = lc.candidate_id) AS skills,
        (SELECT json_agg(DISTINCT project_description) FROM candidate_projects WHERE candidate_id = lc.candidate_id AND project_description IS NOT NULL) AS project_descriptions,
        (SELECT json_agg(DISTINCT degree) FROM candidate_education WHERE candidate_id = lc.candidate_id AND degree IS NOT NULL) AS degrees,
        (SELECT json_agg(DISTINCT coursework) FROM candidate_education WHERE candidate_id = lc.candidate_id AND coursework IS NOT NULL) AS courseworks,
        (SELECT json_agg(DISTINCT university) FROM candidate_education WHERE candidate_id = lc.candidate_id AND university IS NOT NULL) AS universities,
        (SELECT json_agg(DISTINCT certificate_name) FROM candidate_certifications WHERE candidate_id = lc.candidate_id AND certificate_name IS NOT NULL) AS certifications,
        (SELECT json_agg(DISTINCT achievement_description) FROM candidate_achievements WHERE candidate_id = lc.candidate_id AND achievement_description IS NOT NULL) AS achievements
    FROM locked_candidates lc;
    """
    # NOTE(review): pandas warns when handed a raw DBAPI connection (it
    # officially supports SQLAlchemy connectables); this works with psycopg2
    # today but should be re-checked on pandas upgrades.
    return pd.read_sql_query(query, conn, params=(chunk_size,))
69
+
70
def clean_and_format_text(row):
    """Build the embedding-input text for one candidate row.

    Each configured field is rendered as "<Label>: <comma-joined values>" on
    its own line. List values are stripped and de-noised (None/blank entries
    dropped); plain string values are stripped. Missing or empty fields are
    omitted entirely, and the surviving lines are newline-joined.
    """
    sections = (
        ('skills', 'Skills'),
        ('experience_roles', 'Past Roles'),
        ('project_descriptions', 'Projects'),
        ('experience_descriptions', 'Experience Details'),
        ('degrees', 'Education - Degrees'),
        ('certifications', 'Certifications'),
        ('achievements', 'Achievements'),
    )

    lines = []
    for key, label in sections:
        if key not in row:
            continue
        value = row[key]
        if isinstance(value, list):
            # Keep only non-None entries that are non-blank after stripping.
            items = [str(v).strip() for v in value if v is not None and str(v).strip()]
            if items:
                lines.append(f"{label}: " + ", ".join(items))
        elif isinstance(value, str) and value.strip():
            lines.append(f"{label}: {value.strip()}")

    return "\n".join(lines)
94
+
95
def update_db_batch(conn, updates):
    """Bulk-write embeddings back to `candidate_profiles`.

    Args:
        conn: open psycopg2 connection (transaction is committed here).
        updates: sequence of (candidate_id, vector) pairs; the vector is a
            plain Python list that is cast to pgvector via `::vector`.

    Commits on success; rolls back and re-raises on any failure.
    No-op when DRY_RUN is set.
    """
    if DRY_RUN:
        return

    query = """
        UPDATE candidate_profiles AS cp
        SET candidate_embeddings = data.vector::vector,
            candidate_embeddings_updated_at = NOW()
        FROM (VALUES %s) AS data (id, vector)
        WHERE cp.candidate_id = data.id
    """
    try:
        # psycopg2 cursors are context managers: closed automatically on exit.
        with conn.cursor() as cursor:
            execute_values(cursor, query, updates)
        conn.commit()
    except Exception:
        conn.rollback()
        # Bare `raise` preserves the original traceback.
        raise
114
+
115
def run_worker_logic():
    """
    Run one single batch: lock pending candidates, build their embedding
    text, encode it, and write the vectors back to the database.

    Always appends an HTML log entry for this run (success, idle, or error)
    to the in-memory `execution_logs` deque rendered by the `/` endpoint.
    """
    log_buffer = []  # Local buffer so this run's log lines stay contiguous
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    log_buffer.append(f"<b>BATCH RUN: {timestamp}</b>")

    conn = None
    try:
        conn = psycopg2.connect(SUPABASE_CONNECTION_STRING, sslmode='require')

        # 1. Fetch & Lock (row locks held until commit/rollback on `conn`)
        df = fetch_and_lock_chunk(conn, PROCESSING_CHUNK_SIZE)

        if df.empty:
            conn.rollback()
            log_buffer.append("πŸ’€ No pending candidates found.")
            # Add to global logs and exit
            execution_logs.appendleft("<br>".join(log_buffer))
            return "No data"

        log_buffer.append(f"πŸ”’ Locked & Processing {len(df)} candidates...")

        # 2. Clean Text
        df['full_text'] = df.apply(clean_and_format_text, axis=1)

        # 3. Log Inputs (For the Root API view).
        # DB-sourced text is HTML-escaped: it is rendered verbatim on the `/`
        # page, so unescaped names/descriptions could inject markup (XSS).
        for _, row in df.iterrows():
            name = html.escape(str(row.get('candidate_name', 'Unknown')))
            log_buffer.append("<div style='border:1px solid #ccc; margin:5px; padding:5px; background:#f9f9f9'>")
            log_buffer.append(f"<strong>ID: {row['candidate_id']} ({name})</strong>")
            log_buffer.append(f"<pre style='white-space: pre-wrap;'>{html.escape(row['full_text'])}</pre>")
            log_buffer.append("</div>")

        # 4. Generate Embeddings (normalized, so cosine similarity == dot product)
        embeddings = model.encode(
            df['full_text'].tolist(),
            batch_size=EMBEDDING_BATCH_SIZE,
            show_progress_bar=False,
            convert_to_numpy=True,
            normalize_embeddings=True
        )

        # 5. Update DB
        updates = list(zip(df['candidate_id'].tolist(), embeddings.tolist()))

        if not DRY_RUN:
            update_db_batch(conn, updates)
            log_buffer.append(f"βœ… Successfully updated {len(df)} profiles.")
        else:
            conn.rollback()
            log_buffer.append("⚠️ Dry Run: No DB updates made.")

    except Exception as e:
        if conn: conn.rollback()
        log_buffer.append(f"❌ ERROR: {str(e)}")
        print(f"Error: {e}")
    finally:
        if conn: conn.close()
        # Push the local buffer to the global execution log
        execution_logs.appendleft("<br>".join(log_buffer))
177
+
178
+ # --- API Endpoints ---
179
+
180
@app.get("/", response_class=HTMLResponse)
async def read_root():
    """
    Root endpoint: renders the in-memory batch-processing logs as an HTML
    page, most recent batch first.
    """
    header = """
    <html>
    <head>
        <title>Embedding Worker Logs</title>
        <style>
            body { font-family: monospace; padding: 20px; }
            h1 { color: #333; }
            .log-entry { margin-bottom: 20px; border-bottom: 2px solid #333; padding-bottom: 20px; }
        </style>
    </head>
    <body>
        <h1>πŸ“œ Background Worker Execution Logs</h1>
        <p><i>Most recent batches shown first.</i></p>
        <hr>
    """

    # Assemble the page from parts and join once at the end.
    pieces = [header]

    if not execution_logs:
        pieces.append("<p>No logs yet. Hit the <code>/trigger-batch</code> endpoint to start processing.</p>")

    pieces.extend(f"<div class='log-entry'>{entry}</div>" for entry in execution_logs)

    pieces.append("</body></html>")
    return "".join(pieces)
209
+
210
@app.get("/trigger-batch")
async def trigger_processing(background_tasks: BackgroundTasks):
    """
    External pinger endpoint: schedules one batch of processing unless a
    batch is already in flight.
    """
    busy = processing_lock.locked()
    if busy:
        # A previous batch is still running; report busy rather than queue up.
        return {"status": "busy", "message": "Worker is currently processing a previous batch."}

    # Hand the work to a background task so the HTTP response returns fast.
    background_tasks.add_task(wrapped_worker)
    return {"status": "started", "message": "Batch processing started in background."}
221
+
222
def wrapped_worker():
    """Run one batch under the module lock; silently skip if already busy."""
    acquired = processing_lock.acquire(blocking=False)
    if not acquired:
        # Another batch is in progress — non-blocking acquire failed.
        return
    try:
        run_worker_logic()
    finally:
        processing_lock.release()
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ psycopg2-binary
4
+ pandas
5
+ sentence-transformers
6
+ einops
7
+ accelerate