# Hugging Face Space: MCP-1st-Birthday / EastSync-AI (status: Running)
# File size: 12,283 bytes
# Revision: 07273d8

"""Skill extraction from CV text using LLM."""

from __future__ import annotations

import html
import json
import re
from typing import Any, Dict, List

from LLM.llm_models import cv_analyzer_model  # Using CV-specific model (same as orchestrator)


def _print_terminal_log(action: str, details: str = ""):
    """Print formatted log to terminal."""
    timestamp = __import__('datetime').datetime.now().strftime("%H:%M:%S")
    if details:
        print(f"[{timestamp}] [CV ANALYZER] {action} :: {details}")
    else:
        print(f"[{timestamp}] [CV ANALYZER] {action}")


def _empty_skills_result(summary: str = "") -> Dict[str, Any]:
    """Return the result skeleton: every expected key present, all values empty."""
    return {
        "technical_skills": [],
        "soft_skills": [],
        "experience_years": "unknown",
        "recent_roles": [],
        "education": [],
        "certifications": [],
        "domain_expertise": [],
        "summary": summary,
    }


def _find_json_object(text: str) -> Any:
    """Return the first JSON object embedded in *text*, or ``None`` if there is none.

    Each ``{`` is tried in turn as the start of an object. Unlike a greedy
    first-brace-to-last-brace match, this tolerates braces in any commentary
    surrounding the object (markdown fences, trailing notes, and so on).
    """
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            candidate, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            pass
        else:
            if isinstance(candidate, dict):
                return candidate
        start = text.find("{", start + 1)
    return None


def _normalize_skills(parsed: Dict[str, Any]) -> Dict[str, Any]:
    """Overlay *parsed* on the result skeleton and repair missing or null fields."""
    result = _empty_skills_result()
    result.update(parsed)

    list_fields = (
        "technical_skills",
        "soft_skills",
        "recent_roles",
        "education",
        "certifications",
        "domain_expertise",
    )
    for key in list_fields:
        value = result[key]
        if isinstance(value, list):
            continue
        # A bare string is kept as a single entry; null or other junk becomes empty.
        result[key] = [value] if isinstance(value, str) and value.strip() else []

    if result["experience_years"] is None:
        result["experience_years"] = "unknown"
    if result["summary"] is None:
        result["summary"] = ""
    return result


def extract_skills_from_cv_text(cv_text: str, log_callback=None) -> Dict[str, Any]:
    """
    Extract skills and relevant information from CV text using LLM.

    Args:
        cv_text: The extracted text content from a CV
        log_callback: Optional callable invoked as ``log_callback(step, payload)``
            with a step label and a dict of details at each stage of processing.

    Returns:
        Dictionary containing extracted skills and candidate information. The keys
        ``technical_skills``, ``soft_skills``, ``recent_roles``, ``education``,
        ``certifications`` and ``domain_expertise`` are always lists;
        ``experience_years`` and ``summary`` are always present. If the reply
        contains no parseable JSON object, the lists are empty and ``summary``
        holds the first 500 characters of the raw reply. If anything raises
        (including an empty model reply), the same skeleton is returned with an
        additional ``error`` key holding the exception message.
    """

    def notify(step: str, payload: Dict[str, Any]) -> None:
        # Progress reporting is optional; do nothing when no callback was given.
        if log_callback:
            log_callback(step, payload)

    prompt = f"""Analyze the following CV/Resume text and extract ALL relevant information in a structured format.

CV TEXT:
{cv_text}

Please extract and organize the following information:

1. TECHNICAL SKILLS: Programming languages, frameworks, tools, technologies
2. SOFT SKILLS: Communication, leadership, teamwork, problem-solving, etc.
3. PROFESSIONAL EXPERIENCE: Years of experience, job titles, companies
4. EDUCATION: Degrees, certifications, institutions
5. DOMAIN EXPERTISE: Industries, specific domains (e.g., Finance, Healthcare, AI/ML)

Return your analysis in the following JSON-like structure:
{{
    "technical_skills": ["skill1", "skill2", ...],
    "soft_skills": ["skill1", "skill2", ...],
    "experience_years": <number or "unknown">,
    "recent_roles": ["role1", "role2", ...],
    "education": ["degree1", "degree2", ...],
    "certifications": ["cert1", "cert2", ...],
    "domain_expertise": ["domain1", "domain2", ...],
    "summary": "A brief 2-3 sentence summary of the candidate's profile"
}}

Be thorough and extract as many relevant skills as possible. If information is not available, use empty arrays or "unknown"."""

    try:
        _print_terminal_log("Starting AI skill extraction from CV text")
        notify("AI Skill Extraction", {"status": "Initializing LLM model..."})

        # Use the CV analyzer model (same provider as orchestrator - HF/Gemini)
        model = cv_analyzer_model

        _print_terminal_log("LLM Initialized", f"Model ready, CV length: {len(cv_text)} chars")
        notify("AI Analysis", {"status": "Sending CV to AI for analysis", "cv_length": len(cv_text)})

        messages = [
            {
                "role": "system",
                "content": "You are an expert HR analyst specializing in CV/Resume analysis and skill extraction. Extract information accurately and comprehensively."
            },
            {
                "role": "user",
                "content": prompt
            }
        ]

        notify("LLM Request", {"message_count": len(messages), "model": "cv_analyzer_model"})
        _print_terminal_log("Sending request to AI", "Waiting for skill extraction...")

        response = model.generate(messages=messages)

        # The reply may be an object exposing `.content` or a plain value; either
        # way, end up with a real string so the length and parsing code is safe.
        raw_content = response.content if hasattr(response, "content") else response
        if raw_content is None:
            response_text = ""
        elif isinstance(raw_content, str):
            response_text = raw_content
        else:
            response_text = str(raw_content)

        _print_terminal_log("AI Response Received", f"Response length: {len(response_text)} chars")
        notify("AI Response Received", {"response_length": len(response_text)})

        if not response_text.strip():
            # Routed through the handler below so the caller gets a readable
            # error result instead of an empty "successful" analysis.
            raise ValueError("Model returned an empty response")

        # Extract JSON from response (handle markdown code blocks)
        notify("Parsing AI Response", {"status": "Extracting structured data from AI response"})

        parsed = _find_json_object(response_text)

        if parsed is not None:
            notify("JSON Extraction", {"status": "Found JSON in response, parsing..."})
            _print_terminal_log("Parsing JSON response", "Extracting structured skill data...")

            skills_data = _normalize_skills(parsed)

            tech_count = len(skills_data["technical_skills"])
            soft_count = len(skills_data["soft_skills"])

            _print_terminal_log("Skills Extracted Successfully",
                              f"Technical: {tech_count}, Soft: {soft_count}, Total: {tech_count + soft_count}")
            notify("Skills Parsed Successfully", {
                "technical_skills": tech_count,
                "soft_skills": soft_count,
                "total_skills": tech_count + soft_count
            })
        else:
            notify("JSON Extraction Failed", {"status": "No JSON found, using fallback structure"})
            # Fallback: return a basic structure with the raw response
            skills_data = _empty_skills_result(summary=response_text[:500])  # First 500 chars

        _print_terminal_log("✅ CV Analysis Complete", "All skills successfully extracted and structured")
        notify("✅ Extraction Complete", {"status": "CV processing finished successfully"})

        return skills_data

    except Exception as e:
        # Deliberately broad: any failure is reported to the caller as data
        # rather than propagated.
        error_msg = str(e)

        _print_terminal_log(f"❌ ERROR: {type(e).__name__}", error_msg)
        notify("❌ AI Extraction Error", {"error": error_msg, "type": type(e).__name__})

        # Return error information
        return {
            "error": error_msg,
            **_empty_skills_result(summary=f"Failed to extract skills: {error_msg}"),
        }


def _coerce_items(value: Any) -> List[Any]:
    """Return *value* as a list: empty for falsy input, one element for a scalar."""
    if not value:
        return []
    if isinstance(value, (list, tuple, set, frozenset)):
        return list(value)
    # A bare string (or other scalar) is one item, not a sequence of characters.
    return [value]


def _render_chip_section(title: str, items: List[Any], color: str) -> List[str]:
    """Build the HTML fragments for a titled row of pill-shaped labels."""
    parts = [
        '<div style="margin-bottom: 20px;">',
        f'<h3 style="color: var(--arc-yellow); margin-bottom: 12px;">{title}</h3>',
        '<div style="display: flex; flex-wrap: wrap; gap: 8px;">',
    ]
    parts.extend(
        '<span style="background: var(--bg-panel); border: 1px solid var(--border-dim); '
        f'padding: 6px 12px; border-radius: 4px; color: var(--arc-{color}); font-weight: 600; '
        f'font-size: 13px;">{html.escape(str(item))}</span>'
        for item in items
    )
    parts.append('</div></div>')
    return parts


def _render_bullet_list(label: str, items: List[Any]) -> List[str]:
    """Build the HTML fragments for a bold label followed by a bullet list."""
    parts = [
        f'<p style="color: var(--text-main); margin: 8px 0;"><strong>{label}:</strong></p>',
        '<ul style="color: var(--text-dim); margin-top: 4px;">',
    ]
    parts.extend(f'<li>{html.escape(str(item))}</li>' for item in items)
    parts.append('</ul>')
    return parts


def format_skills_for_display(skills_data: Dict[str, Any]) -> str:
    """
    Format extracted skills data into HTML for display in Gradio.

    Every value taken from ``skills_data`` is HTML-escaped before being placed
    in the markup, because the values come from model output and CV text and
    must not be able to inject tags or scripts into the page.

    Args:
        skills_data: Dictionary containing extracted skills. If it has an
            ``error`` key, only an error panel showing ``summary`` is rendered.

    Returns:
        HTML string for display
    """

    if "error" in skills_data:
        message = html.escape(str(skills_data.get('summary', 'Unknown error')))
        return f"""
        <div style="padding: 20px; background: var(--bg-card); border: 1px solid var(--arc-red); border-radius: 4px;">
            <h3 style="color: var(--arc-red); margin-top: 0;">⚠️ Error Extracting Skills</h3>
            <p style="color: var(--text-dim);">{message}</p>
        </div>
        """

    html_parts = [
        '<div style="padding: 24px; background: var(--bg-card); border: 1px solid var(--border-bright); border-radius: 4px; margin-top: 20px;">',
        '<h2 style="color: var(--arc-orange); margin-top: 0; display: flex; align-items: center; gap: 12px;">',
        '<span style="font-size: 32px;">📄</span> CV ANALYSIS COMPLETE',
        '</h2>',
    ]

    # Summary
    summary = skills_data.get("summary")
    if summary:
        html_parts.append('<div style="background: var(--bg-panel); padding: 16px; border-left: 4px solid var(--arc-cyan); margin-bottom: 24px;">')
        html_parts.append(f'<p style="color: var(--text-main); margin: 0; line-height: 1.6;">{html.escape(str(summary))}</p>')
        html_parts.append('</div>')

    # Technical Skills
    technical = _coerce_items(skills_data.get("technical_skills"))
    if technical:
        html_parts.extend(_render_chip_section('💻 TECHNICAL SKILLS', technical, 'green'))

    # Soft Skills
    soft = _coerce_items(skills_data.get("soft_skills"))
    if soft:
        html_parts.extend(_render_chip_section('🤝 SOFT SKILLS', soft, 'cyan'))

    # Experience & Roles
    years = skills_data.get("experience_years")
    roles = _coerce_items(skills_data.get("recent_roles"))
    if years or roles:
        html_parts.append('<div style="margin-bottom: 20px;">')
        html_parts.append('<h3 style="color: var(--arc-yellow); margin-bottom: 12px;">💼 EXPERIENCE</h3>')
        if years:
            html_parts.append(f'<p style="color: var(--text-main); margin: 8px 0;"><strong>Years:</strong> {html.escape(str(years))}</p>')
        if roles:
            html_parts.extend(_render_bullet_list('Recent Roles', roles))
        html_parts.append('</div>')

    # Education
    education = _coerce_items(skills_data.get("education"))
    certifications = _coerce_items(skills_data.get("certifications"))
    if education or certifications:
        html_parts.append('<div style="margin-bottom: 20px;">')
        html_parts.append('<h3 style="color: var(--arc-yellow); margin-bottom: 12px;">🎓 EDUCATION & CERTIFICATIONS</h3>')
        if education:
            html_parts.extend(_render_bullet_list('Education', education))
        if certifications:
            html_parts.extend(_render_bullet_list('Certifications', certifications))
        html_parts.append('</div>')

    # Domain Expertise
    domains = _coerce_items(skills_data.get("domain_expertise"))
    if domains:
        html_parts.extend(_render_chip_section('🎯 DOMAIN EXPERTISE', domains, 'orange'))

    html_parts.append('</div>')

    return ''.join(html_parts)