Spaces:

aursalan
/

latch_candidates

Running

App Files Files Community

aursalan committed 19 days ago

Commit

f5f1dc9

1 Parent(s): 33afddb

Added update

Browse files

Files changed (2) hide show

__pycache__/main.cpython-313.pyc +0 -0
main.py +125 -64

__pycache__/main.cpython-313.pyc ADDED Viewed

Binary file (12.7 kB). View file

main.py CHANGED Viewed

@@ -21,16 +21,15 @@ DRY_RUN = False
 # --- Global State ---
 model = None
-execution_logs = deque(maxlen=50) # Stores the last 50 batch logs in RAM
-is_processing = False # Lock to prevent overlapping pings
 processing_lock = threading.Lock()
-# --- Lifespan Manager (Loads Model on Startup) ---
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     global model
     print("⏳ Loading Model...")
-    # Load model once when the API starts
     model = SentenceTransformer('Alibaba-NLP/gte-modernbert-base', trust_remote_code=True)
     print("✅ Model Loaded.")
     yield
@@ -41,66 +40,136 @@ app = FastAPI(lifespan=lifespan)
 # --- Helper Functions ---
 def fetch_and_lock_chunk(conn, chunk_size):
     query = """
-    WITH locked_candidates AS (
-        SELECT candidate_id, candidate_name
-        FROM candidate_profiles
-        WHERE
-            candidate_embeddings IS NULL
-            OR updated_at > candidate_embeddings_updated_at
-        LIMIT %s
-        FOR UPDATE SKIP LOCKED
-    )
     SELECT
-        lc.candidate_id,
-        lc.candidate_name,
-        (SELECT json_agg(DISTINCT role) FROM candidate_experiences WHERE candidate_id = lc.candidate_id AND role IS NOT NULL) AS experience_roles,
-        (SELECT json_agg(DISTINCT experience_description) FROM candidate_experiences WHERE candidate_id = lc.candidate_id AND experience_description IS NOT NULL) AS experience_descriptions,
-        (SELECT json_agg(DISTINCT s.skill_name) FROM candidate_skill_map csm JOIN skills s ON csm.skill_id = s.skill_id WHERE csm.candidate_id = lc.candidate_id) AS skills,
-        (SELECT json_agg(DISTINCT project_description) FROM candidate_projects WHERE candidate_id = lc.candidate_id AND project_description IS NOT NULL) AS project_descriptions,
-        (SELECT json_agg(DISTINCT degree) FROM candidate_education WHERE candidate_id = lc.candidate_id AND degree IS NOT NULL) AS degrees,
-        (SELECT json_agg(DISTINCT coursework) FROM candidate_education WHERE candidate_id = lc.candidate_id AND coursework IS NOT NULL) AS courseworks,
-        (SELECT json_agg(DISTINCT university) FROM candidate_education WHERE candidate_id = lc.candidate_id AND university IS NOT NULL) AS universities,
-        (SELECT json_agg(DISTINCT certificate_name) FROM candidate_certifications WHERE candidate_id = lc.candidate_id AND certificate_name IS NOT NULL) AS certifications,
-        (SELECT json_agg(DISTINCT achievement_description) FROM candidate_achievements WHERE candidate_id = lc.candidate_id AND achievement_description IS NOT NULL) AS achievements
-    FROM locked_candidates lc;
     """
     return pd.read_sql_query(query, conn, params=(chunk_size,))
 def clean_and_format_text(row):
-    field_config = [
-        ('skills', 'Skills'),
-        ('experience_roles', 'Past Roles'),
-        ('project_descriptions', 'Projects'),
-        ('experience_descriptions', 'Experience Details'),
-        ('degrees', 'Education - Degrees'),
-        ('certifications', 'Certifications'),
-        ('achievements', 'Achievements'),
-    ]
     text_parts = []
-    for col_name, tag in field_config:
-        if col_name in row:
-            data = row[col_name]
-            if isinstance(data, list) and len(data) > 0:
-                clean_items = [str(item).strip() for item in data if item is not None and str(item).strip()]
-                if clean_items:
-                    text_parts.append(f"{tag}: " + ", ".join(clean_items))
-            elif isinstance(data, str) and data.strip():
-                text_parts.append(f"{tag}: {data.strip()}")
-    return "\n".join(text_parts)
 def update_db_batch(conn, updates):
     if DRY_RUN: return
     query = """
-        UPDATE candidate_profiles AS cp
-        SET candidate_embeddings = data.vector::vector,
-            candidate_embeddings_updated_at = NOW()
         FROM (VALUES %s) AS data (id, vector)
-        WHERE cp.candidate_id = data.id
     """
     cursor = conn.cursor()
     try:
@@ -116,7 +185,7 @@ def run_worker_logic():
     """
     The core logic that runs one single batch processing.
     """
-    log_buffer = [] # Local buffer to capture logs for this specific run
     timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
     log_buffer.append(f"<b>BATCH RUN: {timestamp}</b>")
@@ -131,7 +200,6 @@ def run_worker_logic():
         if df.empty:
             conn.rollback()
             log_buffer.append("💤 No pending candidates found.")
-            # Add to global logs and exit
             execution_logs.appendleft("<br>".join(log_buffer))
             return "No data"
@@ -143,7 +211,8 @@ def run_worker_logic():
         # 3. Log Inputs (For the Root API view)
         for index, row in df.iterrows():
             log_buffer.append(f"<div style='border:1px solid #ccc; margin:5px; padding:5px; background:#f9f9f9'>")
-            log_buffer.append(f"<strong>ID: {row['candidate_id']} ({row.get('candidate_name', 'Unknown')})</strong>")
             log_buffer.append(f"<pre style='white-space: pre-wrap;'>{row['full_text']}</pre>")
             log_buffer.append("</div>")
@@ -157,7 +226,8 @@ def run_worker_logic():
         )
         # 5. Update DB
-        updates = list(zip(df['candidate_id'].tolist(), embeddings.tolist()))
         if not DRY_RUN:
             update_db_batch(conn, updates)
@@ -172,16 +242,12 @@ def run_worker_logic():
         print(f"Error: {e}")
     finally:
         if conn: conn.close()
-        # Push the local buffer to the global execution log
         execution_logs.appendleft("<br>".join(log_buffer))
 # --- API Endpoints ---
 @app.get("/", response_class=HTMLResponse)
 async def read_root():
-    """
-    Root endpoint: Displays the logs of recent processing batches.
-    """
     html_content = """
     <html>
         <head>
@@ -193,7 +259,7 @@ async def read_root():
             </style>
         </head>
         <body>
-            <h1>📜 Background Worker Execution Logs</h1>
             <p><i>Most recent batches shown first.</i></p>
             <hr>
     """
@@ -209,18 +275,13 @@ async def read_root():
 @app.get("/trigger-batch")
 async def trigger_processing(background_tasks: BackgroundTasks):
-    """
-    External Pinger: Hits this endpoint to trigger one batch of processing.
-    """
     if processing_lock.locked():
         return {"status": "busy", "message": "Worker is currently processing a previous batch."}
-    # We run the worker in a background task so the API response is fast
     background_tasks.add_task(wrapped_worker)
     return {"status": "started", "message": "Batch processing started in background."}
 def wrapped_worker():
-    """Thread-safe wrapper for the worker logic"""
     if processing_lock.acquire(blocking=False):
         try:
             run_worker_logic()

 # --- Global State ---
 model = None
+execution_logs = deque(maxlen=50)
+is_processing = False
 processing_lock = threading.Lock()
+# --- Lifespan Manager ---
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     global model
     print("⏳ Loading Model...")
     model = SentenceTransformer('Alibaba-NLP/gte-modernbert-base', trust_remote_code=True)
     print("✅ Model Loaded.")
     yield
 # --- Helper Functions ---
 def fetch_and_lock_chunk(conn, chunk_size):
     """
     Read up to ``chunk_size`` rows of ``public.candidates`` whose
     ``embeddings`` column is NULL and return them as a DataFrame.

     The statement carries ``FOR UPDATE SKIP LOCKED``, so the selected rows
     are row-locked on ``conn`` and rows already locked elsewhere are passed
     over instead of being waited on.
     """
     # Note: if an 'updated_at' column is added later, widen the filter to:
     #   WHERE embeddings IS NULL OR updated_at > embeddings_created_at
     pending_candidates_sql = """
     SELECT
         id,
         name,
         summary,
         work_experience,
         projects,
         education,
         achievements,
         certifications,
         volunteering,
         skills,
         languages
     FROM public.candidates
     WHERE
         embeddings IS NULL
     FOR UPDATE SKIP LOCKED
     LIMIT %s
     """
     limit_params = (chunk_size,)
     return pd.read_sql_query(pending_candidates_sql, conn, params=limit_params)
 # Layout of the list-of-dict sections, in output order:
 # (row key, heading, entry separator, ((item field, connector placed before it), ...)).
 _ENTRY_SECTIONS = (
     ('work_experience', "Work Experience:\n", "\n",
      (('role', ""), ('company', " at "), ('description', ": "))),
     ('projects', "Projects:\n", "\n",
      (('title', ""), ('description', ": "))),
     ('education', "Education: ", ", ",
      (('degree', ""), ('institution', " from "))),
     ('certifications', "Certifications: ", ", ",
      (('name', ""), ('issuer', " by "))),
     ('achievements', "Achievements: ", "; ",
      (('title', ""), ('description', ": "))),
 )


 def _clean_text(value):
     """Return ``value`` as stripped text; ``''`` for None, NaN or blank input."""
     if value is None:
         return ""
     # NaN is the only float not equal to itself; it is truthy and would
     # otherwise be rendered as the literal string "nan".
     if isinstance(value, float) and value != value:
         return ""
     return str(value).strip()


 def _join_nonempty(left, right, connector):
     """Join two strings with ``connector`` only when both are non-empty."""
     if left and right:
         return f"{left}{connector}{right}"
     return left or right


 def _format_entry(item, fields):
     """Fold the non-empty ``fields`` of one dict into a single text entry."""
     entry = ""
     for field, connector in fields:
         entry = _join_nonempty(entry, _clean_text(item.get(field)), connector)
     return entry


 def clean_and_format_text(row):
     """
     Build the plain-text representation of one candidate row for embedding.

     Args:
         row: Mapping-like object supporting ``.get(key)`` (a dict or a
             pandas Series). Keys read: ``name``, ``summary``, ``skills``
             (list of scalars) and the list-of-dict columns
             ``work_experience``, ``projects``, ``education``,
             ``certifications`` and ``achievements``.

     Returns:
         The non-empty sections joined by blank lines, in the order Name,
         Summary, Skills, Work Experience, Projects, Education,
         Certifications, Achievements. Returns ``""`` when nothing usable
         is present.

     Missing, ``None``, NaN or blank fields are dropped together with their
     connector word, so an entry never degrades to "at", "Dev at" or
     "None at None", and entries with no content at all are skipped.
     """
     text_parts = []

     # 1. Scalar fields.
     for key, label in (('name', "Name"), ('summary', "Summary")):
         value = _clean_text(row.get(key))
         if value:
             text_parts.append(f"{label}: {value}")

     # 2. Skills: coerce to text so a non-string element cannot break join().
     skills = row.get('skills')
     if isinstance(skills, list):
         valid_skills = [text for text in map(_clean_text, skills) if text]
         if valid_skills:
             text_parts.append("Skills: " + ", ".join(valid_skills))

     # 3. List-of-dict sections; anything that is not a list of dicts is ignored.
     for key, heading, separator, fields in _ENTRY_SECTIONS:
         items = row.get(key)
         if not isinstance(items, list):
             continue
         candidates = (_format_entry(item, fields) for item in items if isinstance(item, dict))
         entries = [entry for entry in candidates if entry]
         if entries:
             text_parts.append(heading + separator.join(entries))

     return "\n\n".join(text_parts)
 def update_db_batch(conn, updates):
     if DRY_RUN: return
+    # Updated to target public.candidates and cast ID to UUID
     query = """
+        UPDATE public.candidates AS c
+        SET embeddings = data.vector::vector,
+            embeddings_created_at = NOW()
         FROM (VALUES %s) AS data (id, vector)
+        WHERE c.id = data.id::uuid
     """
     cursor = conn.cursor()
     try:
     """
     The core logic that runs one single batch processing.
     """
+    log_buffer = []
     timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
     log_buffer.append(f"<b>BATCH RUN: {timestamp}</b>")
         if df.empty:
             conn.rollback()
             log_buffer.append("💤 No pending candidates found.")
             execution_logs.appendleft("<br>".join(log_buffer))
             return "No data"
         # 3. Log Inputs (For the Root API view)
         for index, row in df.iterrows():
             log_buffer.append(f"<div style='border:1px solid #ccc; margin:5px; padding:5px; background:#f9f9f9'>")
+            # row['id'] is now the UUID
+            log_buffer.append(f"<strong>ID: {row['id']} ({row.get('name', 'Unknown')})</strong>")
             log_buffer.append(f"<pre style='white-space: pre-wrap;'>{row['full_text']}</pre>")
             log_buffer.append("</div>")
         )
         # 5. Update DB
+        # Ensure ID is converted to string for the tuple list if it isn't already
+        updates = list(zip(df['id'].astype(str).tolist(), embeddings.tolist()))
         if not DRY_RUN:
             update_db_batch(conn, updates)
         print(f"Error: {e}")
     finally:
         if conn: conn.close()
         execution_logs.appendleft("<br>".join(log_buffer))
 # --- API Endpoints ---
 @app.get("/", response_class=HTMLResponse)
 async def read_root():
     html_content = """
     <html>
         <head>
             </style>
         </head>
         <body>
+            <h1>📜 Candidates Embedding Worker</h1>
             <p><i>Most recent batches shown first.</i></p>
             <hr>
     """
 @app.get("/trigger-batch")
 async def trigger_processing(background_tasks: BackgroundTasks):
     """
     Queue one batch run unless a batch is already being processed.

     Responds with a "busy" payload while ``processing_lock`` is held;
     otherwise registers ``wrapped_worker`` as a background task and
     responds with a "started" payload.
     """
     worker_idle = not processing_lock.locked()
     if worker_idle:
         background_tasks.add_task(wrapped_worker)
         return {"status": "started", "message": "Batch processing started in background."}
     return {"status": "busy", "message": "Worker is currently processing a previous batch."}
 def wrapped_worker():
     if processing_lock.acquire(blocking=False):
         try:
             run_worker_logic()