# EastSync-AI / utils / skill_extractor.py
# Daniel Tatar
# CV reader + matching project (#13)
# 07273d8
"""Skill extraction from CV text using LLM."""
from __future__ import annotations
from typing import Any, Dict, List
from LLM.llm_models import cv_analyzer_model # Using CV-specific model (same as orchestrator)
import json
import re
def _print_terminal_log(action: str, details: str = "") -> None:
    """Print a timestamped ``[CV ANALYZER]`` progress line to stdout.

    Args:
        action: Short label for the step being logged.
        details: Optional extra text; when non-empty it is appended to the
            line after a ``::`` separator.
    """
    # A plain local import replaces the previous ``__import__('datetime')``
    # call; it keeps this helper self-contained without touching the
    # module-level import block.
    from datetime import datetime

    timestamp = datetime.now().strftime("%H:%M:%S")
    suffix = f" :: {details}" if details else ""
    print(f"[{timestamp}] [CV ANALYZER] {action}{suffix}")
def _empty_skills_profile(summary: str) -> Dict[str, Any]:
    """Return a skills dictionary with every expected key at its empty default."""
    return {
        "technical_skills": [],
        "soft_skills": [],
        "experience_years": "unknown",
        "recent_roles": [],
        "education": [],
        "certifications": [],
        "domain_expertise": [],
        "summary": summary,
    }


def _normalize_skills_profile(profile: Dict[str, Any]) -> Dict[str, Any]:
    """Fill in missing keys and coerce list-valued fields to real lists (in place)."""
    for key, default in _empty_skills_profile("").items():
        value = profile.get(key)
        if isinstance(default, list):
            if isinstance(value, list):
                continue
            # ``null``/empty string mean "nothing found"; any other scalar is
            # treated as a single entry rather than crashing ``len()`` later.
            profile[key] = [] if value in (None, "") else [value]
        elif value is None:
            profile[key] = default
    return profile


def extract_skills_from_cv_text(cv_text: str, log_callback=None) -> Dict[str, Any]:
    """
    Extract skills and relevant information from CV text using LLM.

    The CV text is embedded in a prompt, sent to ``cv_analyzer_model`` and the
    first JSON object found in the reply is decoded and normalized.

    Args:
        cv_text: The extracted text content from a CV
        log_callback: Optional callable invoked as ``log_callback(step, details)``
            with a step name (str) and a details dictionary at each stage.

    Returns:
        Dictionary containing extracted skills and candidate information. It
        always carries the keys ``technical_skills``, ``soft_skills``,
        ``experience_years``, ``recent_roles``, ``education``,
        ``certifications``, ``domain_expertise`` and ``summary``. When the
        reply contains no JSON, the lists are empty and ``summary`` holds the
        first 500 characters of the raw reply. On any exception an additional
        ``error`` key holds the exception message; the function never raises
        for failures inside the extraction itself.
    """

    def notify(step: str, details: Dict[str, Any]) -> None:
        # Single place for the "callback is optional" check.
        if log_callback:
            log_callback(step, details)

    prompt = f"""Analyze the following CV/Resume text and extract ALL relevant information in a structured format.
CV TEXT:
{cv_text}
Please extract and organize the following information:
1. TECHNICAL SKILLS: Programming languages, frameworks, tools, technologies
2. SOFT SKILLS: Communication, leadership, teamwork, problem-solving, etc.
3. PROFESSIONAL EXPERIENCE: Years of experience, job titles, companies
4. EDUCATION: Degrees, certifications, institutions
5. DOMAIN EXPERTISE: Industries, specific domains (e.g., Finance, Healthcare, AI/ML)
Return your analysis in the following JSON-like structure:
{{
"technical_skills": ["skill1", "skill2", ...],
"soft_skills": ["skill1", "skill2", ...],
"experience_years": <number or "unknown">,
"recent_roles": ["role1", "role2", ...],
"education": ["degree1", "degree2", ...],
"certifications": ["cert1", "cert2", ...],
"domain_expertise": ["domain1", "domain2", ...],
"summary": "A brief 2-3 sentence summary of the candidate's profile"
}}
Be thorough and extract as many relevant skills as possible. If information is not available, use empty arrays or "unknown"."""
    try:
        _print_terminal_log("Starting AI skill extraction from CV text")
        notify("AI Skill Extraction", {"status": "Initializing LLM model..."})

        # Use the CV analyzer model (same provider as orchestrator - HF/Gemini)
        model = cv_analyzer_model
        _print_terminal_log("LLM Initialized", f"Model ready, CV length: {len(cv_text)} chars")
        notify("AI Analysis", {"status": "Sending CV to AI for analysis", "cv_length": len(cv_text)})

        messages = [
            {
                "role": "system",
                "content": "You are an expert HR analyst specializing in CV/Resume analysis and skill extraction. Extract information accurately and comprehensively."
            },
            {
                "role": "user",
                "content": prompt
            }
        ]
        notify("LLM Request", {"message_count": len(messages), "model": "cv_analyzer_model"})
        _print_terminal_log("Sending request to AI", "Waiting for skill extraction...")
        response = model.generate(messages=messages)

        # Handle ChatMessage object - convert to string
        if hasattr(response, 'content'):
            response_text = response.content
        else:
            response_text = str(response)

        response_length = len(response_text) if response_text else 0
        _print_terminal_log("AI Response Received", f"Response length: {response_length} chars")
        notify("AI Response Received", {"response_length": response_length})

        if response_text is None:
            # Previously this surfaced as an opaque TypeError from re.search.
            raise ValueError("LLM returned no content")
        if not isinstance(response_text, str):
            response_text = str(response_text)

        # Extract JSON from response (handle markdown code blocks)
        notify("Parsing AI Response", {"status": "Extracting structured data from AI response"})
        json_match = re.search(r'\{[\s\S]*\}', response_text)
        if json_match:
            notify("JSON Extraction", {"status": "Found JSON in response, parsing..."})
            _print_terminal_log("Parsing JSON response", "Extracting structured skill data...")

            # The regex is greedy (first "{" to last "}"), so text containing
            # braces after the JSON would make json.loads fail with
            # "Extra data". raw_decode parses the object that starts at the
            # first brace and ignores whatever follows it.
            skills_data, _ = json.JSONDecoder().raw_decode(json_match.group())
            skills_data = _normalize_skills_profile(skills_data)

            tech_count = len(skills_data["technical_skills"])
            soft_count = len(skills_data["soft_skills"])
            _print_terminal_log(
                "Skills Extracted Successfully",
                f"Technical: {tech_count}, Soft: {soft_count}, Total: {tech_count + soft_count}",
            )
            notify("Skills Parsed Successfully", {
                "technical_skills": tech_count,
                "soft_skills": soft_count,
                "total_skills": tech_count + soft_count
            })
        else:
            notify("JSON Extraction Failed", {"status": "No JSON found, using fallback structure"})
            # Fallback: return a basic structure with the raw response
            skills_data = _empty_skills_profile(response_text[:500])  # First 500 chars

        _print_terminal_log("✅ CV Analysis Complete", "All skills successfully extracted and structured")
        notify("✅ Extraction Complete", {"status": "CV processing finished successfully"})
        return skills_data
    except Exception as e:
        # Deliberate catch-all boundary: callers receive an error dictionary
        # instead of an exception.
        error_msg = str(e)
        _print_terminal_log(f"❌ ERROR: {type(e).__name__}", error_msg)
        notify("❌ AI Extraction Error", {"error": error_msg, "type": type(e).__name__})
        # Return error information
        return {
            "error": error_msg,
            **_empty_skills_profile(f"Failed to extract skills: {error_msg}"),
        }
def _escape_html(value: Any) -> str:
    """Return ``str(value)`` with HTML special characters escaped."""
    from html import escape

    return escape(str(value))


def _as_items(value: Any) -> List[Any]:
    """Return *value* as a list, treating a bare string as a single item."""
    if isinstance(value, str):
        return [value]
    return list(value)


def _chip_section(title: str, items: Any, color_var: str) -> List[str]:
    """Build the HTML fragments for a titled, wrapping row of coloured chips."""
    parts = [
        '<div style="margin-bottom: 20px;">',
        f'<h3 style="color: var(--arc-yellow); margin-bottom: 12px;">{title}</h3>',
        '<div style="display: flex; flex-wrap: wrap; gap: 8px;">',
    ]
    parts.extend(
        f'<span style="background: var(--bg-panel); border: 1px solid var(--border-dim); '
        f'padding: 6px 12px; border-radius: 4px; color: var({color_var}); font-weight: 600; '
        f'font-size: 13px;">{_escape_html(item)}</span>'
        for item in _as_items(items)
    )
    parts.append('</div></div>')
    return parts


def _bullet_list(label: str, items: Any) -> List[str]:
    """Build the HTML fragments for a bold label followed by a bulleted list."""
    parts = [
        f'<p style="color: var(--text-main); margin: 8px 0;"><strong>{label}:</strong></p>',
        '<ul style="color: var(--text-dim); margin-top: 4px;">',
    ]
    parts.extend(f'<li>{_escape_html(item)}</li>' for item in _as_items(items))
    parts.append('</ul>')
    return parts


def format_skills_for_display(skills_data: Dict[str, Any]) -> str:
    """
    Format extracted skills data into HTML for display in Gradio.

    Every value taken from ``skills_data`` is HTML-escaped before being
    inserted, because it originates from CV text / model output and must not
    be able to inject markup into the page.

    Args:
        skills_data: Dictionary containing extracted skills. Recognised keys:
            ``summary``, ``technical_skills``, ``soft_skills``,
            ``experience_years``, ``recent_roles``, ``education``,
            ``certifications``, ``domain_expertise``. If an ``error`` key is
            present, only an error panel showing ``summary`` is rendered.

    Returns:
        HTML string for display. Sections whose data is missing or empty are
        omitted.
    """
    if "error" in skills_data:
        error_summary = _escape_html(skills_data.get('summary', 'Unknown error'))
        return (
            '\n'
            '<div style="padding: 20px; background: var(--bg-card); border: 1px solid var(--arc-red); border-radius: 4px;">\n'
            '<h3 style="color: var(--arc-red); margin-top: 0;">⚠️ Error Extracting Skills</h3>\n'
            f'<p style="color: var(--text-dim);">{error_summary}</p>\n'
            '</div>\n'
        )

    html_parts = [
        '<div style="padding: 24px; background: var(--bg-card); border: 1px solid var(--border-bright); border-radius: 4px; margin-top: 20px;">',
        '<h2 style="color: var(--arc-orange); margin-top: 0; display: flex; align-items: center; gap: 12px;">',
        '<span style="font-size: 32px;">📄</span> CV ANALYSIS COMPLETE',
        '</h2>',
    ]

    # Summary
    summary = skills_data.get("summary")
    if summary:
        html_parts.append('<div style="background: var(--bg-panel); padding: 16px; border-left: 4px solid var(--arc-cyan); margin-bottom: 24px;">')
        html_parts.append(f'<p style="color: var(--text-main); margin: 0; line-height: 1.6;">{_escape_html(summary)}</p>')
        html_parts.append('</div>')

    # Technical Skills
    technical = skills_data.get("technical_skills")
    if technical:
        html_parts.extend(_chip_section('💻 TECHNICAL SKILLS', technical, '--arc-green'))

    # Soft Skills
    soft = skills_data.get("soft_skills")
    if soft:
        html_parts.extend(_chip_section('🤝 SOFT SKILLS', soft, '--arc-cyan'))

    # Experience & Roles
    years = skills_data.get("experience_years")
    roles = skills_data.get("recent_roles")
    if years or roles:
        html_parts.append('<div style="margin-bottom: 20px;">')
        html_parts.append('<h3 style="color: var(--arc-yellow); margin-bottom: 12px;">💼 EXPERIENCE</h3>')
        if years:
            html_parts.append(f'<p style="color: var(--text-main); margin: 8px 0;"><strong>Years:</strong> {_escape_html(years)}</p>')
        if roles:
            html_parts.extend(_bullet_list('Recent Roles', roles))
        html_parts.append('</div>')

    # Education
    education = skills_data.get("education")
    certifications = skills_data.get("certifications")
    if education or certifications:
        html_parts.append('<div style="margin-bottom: 20px;">')
        html_parts.append('<h3 style="color: var(--arc-yellow); margin-bottom: 12px;">🎓 EDUCATION & CERTIFICATIONS</h3>')
        if education:
            html_parts.extend(_bullet_list('Education', education))
        if certifications:
            html_parts.extend(_bullet_list('Certifications', certifications))
        html_parts.append('</div>')

    # Domain Expertise
    domains = skills_data.get("domain_expertise")
    if domains:
        html_parts.extend(_chip_section('🎯 DOMAIN EXPERTISE', domains, '--arc-orange'))

    html_parts.append('</div>')
    return ''.join(html_parts)