Spaces:

MCP-1st-Birthday
/

EastSync-AI

Running

EastSync-AI / utils /skill_extractor.py

Daniel Tatar

Cv reader + matching project (#13)

07273d8 17 days ago

12.3 kB

	"""Skill extraction from CV text using LLM."""

	from __future__ import annotations

	from typing import Any, Dict, List

	from LLM.llm_models import cv_analyzer_model # Using CV-specific model (same as orchestrator)
	import json
	import re


	def _print_terminal_log(action: str, details: str = ""):
	"""Print formatted log to terminal."""
	timestamp = __import__('datetime').datetime.now().strftime("%H:%M:%S")
	if details:
	print(f"[{timestamp}] [CV ANALYZER] {action} :: {details}")
	else:
	print(f"[{timestamp}] [CV ANALYZER] {action}")


	def extract_skills_from_cv_text(cv_text: str, log_callback=None) -> Dict[str, Any]:
	"""
	Extract skills and relevant information from CV text using LLM.

	Args:
	cv_text: The extracted text content from a CV

	Returns:
	Dictionary containing extracted skills and candidate information
	"""

	prompt = f"""Analyze the following CV/Resume text and extract ALL relevant information in a structured format.

	CV TEXT:
	{cv_text}

	Please extract and organize the following information:

	1. TECHNICAL SKILLS: Programming languages, frameworks, tools, technologies
	2. SOFT SKILLS: Communication, leadership, teamwork, problem-solving, etc.
	3. PROFESSIONAL EXPERIENCE: Years of experience, job titles, companies
	4. EDUCATION: Degrees, certifications, institutions
	5. DOMAIN EXPERTISE: Industries, specific domains (e.g., Finance, Healthcare, AI/ML)

	Return your analysis in the following JSON-like structure:
	{{
	"technical_skills": ["skill1", "skill2", ...],
	"soft_skills": ["skill1", "skill2", ...],
	"experience_years": <number or "unknown">,
	"recent_roles": ["role1", "role2", ...],
	"education": ["degree1", "degree2", ...],
	"certifications": ["cert1", "cert2", ...],
	"domain_expertise": ["domain1", "domain2", ...],
	"summary": "A brief 2-3 sentence summary of the candidate's profile"
	}}

	Be thorough and extract as many relevant skills as possible. If information is not available, use empty arrays or "unknown"."""

	try:
	_print_terminal_log("Starting AI skill extraction from CV text")

	if log_callback:
	log_callback("AI Skill Extraction", {"status": "Initializing LLM model..."})

	# Use the CV analyzer model (same provider as orchestrator - HF/Gemini)
	model = cv_analyzer_model

	_print_terminal_log("LLM Initialized", f"Model ready, CV length: {len(cv_text)} chars")

	if log_callback:
	log_callback("AI Analysis", {"status": "Sending CV to AI for analysis", "cv_length": len(cv_text)})

	messages = [
	{
	"role": "system",
	"content": "You are an expert HR analyst specializing in CV/Resume analysis and skill extraction. Extract information accurately and comprehensively."
	},
	{
	"role": "user",
	"content": prompt
	}
	]

	if log_callback:
	log_callback("LLM Request", {"message_count": len(messages), "model": "cv_analyzer_model"})

	_print_terminal_log("Sending request to AI", "Waiting for skill extraction...")

	response = model.generate(messages=messages)

	# Handle ChatMessage object - convert to string
	if hasattr(response, 'content'):
	response_text = response.content
	else:
	response_text = str(response)

	_print_terminal_log("AI Response Received", f"Response length: {len(response_text) if response_text else 0} chars")

	if log_callback:
	log_callback("AI Response Received", {"response_length": len(response_text) if response_text else 0})

	# Extract JSON from response (handle markdown code blocks)

	if log_callback:
	log_callback("Parsing AI Response", {"status": "Extracting structured data from AI response"})

	json_match = re.search(r'\{[\s\S]*\}', response_text)

	if json_match:
	if log_callback:
	log_callback("JSON Extraction", {"status": "Found JSON in response, parsing..."})

	_print_terminal_log("Parsing JSON response", "Extracting structured skill data...")

	skills_data = json.loads(json_match.group())

	tech_count = len(skills_data.get("technical_skills", []))
	soft_count = len(skills_data.get("soft_skills", []))

	_print_terminal_log("Skills Extracted Successfully",
	f"Technical: {tech_count}, Soft: {soft_count}, Total: {tech_count + soft_count}")

	if log_callback:
	log_callback("Skills Parsed Successfully", {
	"technical_skills": tech_count,
	"soft_skills": soft_count,
	"total_skills": tech_count + soft_count
	})
	else:
	if log_callback:
	log_callback("JSON Extraction Failed", {"status": "No JSON found, using fallback structure"})
	# Fallback: return a basic structure with the raw response
	skills_data = {
	"technical_skills": [],
	"soft_skills": [],
	"experience_years": "unknown",
	"recent_roles": [],
	"education": [],
	"certifications": [],
	"domain_expertise": [],
	"summary": response_text[:500] # First 500 chars
	}

	_print_terminal_log("✅ CV Analysis Complete", "All skills successfully extracted and structured")

	if log_callback:
	log_callback("✅ Extraction Complete", {"status": "CV processing finished successfully"})

	return skills_data

	except Exception as e:
	error_msg = str(e)

	_print_terminal_log(f"❌ ERROR: {type(e).__name__}", error_msg)

	if log_callback:
	log_callback("❌ AI Extraction Error", {"error": error_msg, "type": type(e).__name__})

	# Return error information
	return {
	"error": error_msg,
	"technical_skills": [],
	"soft_skills": [],
	"experience_years": "unknown",
	"recent_roles": [],
	"education": [],
	"certifications": [],
	"domain_expertise": [],
	"summary": f"Failed to extract skills: {error_msg}"
	}


	def format_skills_for_display(skills_data: Dict[str, Any]) -> str:
	"""
	Format extracted skills data into HTML for display in Gradio.

	Args:
	skills_data: Dictionary containing extracted skills

	Returns:
	HTML string for display
	"""

	if "error" in skills_data:
	return f"""
	<div style="padding: 20px; background: var(--bg-card); border: 1px solid var(--arc-red); border-radius: 4px;">
	<h3 style="color: var(--arc-red); margin-top: 0;">⚠️ Error Extracting Skills</h3>
	<p style="color: var(--text-dim);">{skills_data.get('summary', 'Unknown error')}</p>
	</div>
	"""

	html_parts = [
	'<div style="padding: 24px; background: var(--bg-card); border: 1px solid var(--border-bright); border-radius: 4px; margin-top: 20px;">',
	'<h2 style="color: var(--arc-orange); margin-top: 0; display: flex; align-items: center; gap: 12px;">',
	'<span style="font-size: 32px;">📄</span> CV ANALYSIS COMPLETE',
	'</h2>',
	]

	# Summary
	if skills_data.get("summary"):
	html_parts.append(f'<div style="background: var(--bg-panel); padding: 16px; border-left: 4px solid var(--arc-cyan); margin-bottom: 24px;">')
	html_parts.append(f'<p style="color: var(--text-main); margin: 0; line-height: 1.6;">{skills_data["summary"]}</p>')
	html_parts.append('</div>')

	# Technical Skills
	if skills_data.get("technical_skills"):
	html_parts.append('<div style="margin-bottom: 20px;">')
	html_parts.append('<h3 style="color: var(--arc-yellow); margin-bottom: 12px;">💻 TECHNICAL SKILLS</h3>')
	html_parts.append('<div style="display: flex; flex-wrap: wrap; gap: 8px;">')
	for skill in skills_data["technical_skills"]:
	html_parts.append(
	f'<span style="background: var(--bg-panel); border: 1px solid var(--border-dim); '
	f'padding: 6px 12px; border-radius: 4px; color: var(--arc-green); font-weight: 600; '
	f'font-size: 13px;">{skill}</span>'
	)
	html_parts.append('</div></div>')

	# Soft Skills
	if skills_data.get("soft_skills"):
	html_parts.append('<div style="margin-bottom: 20px;">')
	html_parts.append('<h3 style="color: var(--arc-yellow); margin-bottom: 12px;">🤝 SOFT SKILLS</h3>')
	html_parts.append('<div style="display: flex; flex-wrap: wrap; gap: 8px;">')
	for skill in skills_data["soft_skills"]:
	html_parts.append(
	f'<span style="background: var(--bg-panel); border: 1px solid var(--border-dim); '
	f'padding: 6px 12px; border-radius: 4px; color: var(--arc-cyan); font-weight: 600; '
	f'font-size: 13px;">{skill}</span>'
	)
	html_parts.append('</div></div>')

	# Experience & Roles
	if skills_data.get("experience_years") or skills_data.get("recent_roles"):
	html_parts.append('<div style="margin-bottom: 20px;">')
	html_parts.append('<h3 style="color: var(--arc-yellow); margin-bottom: 12px;">💼 EXPERIENCE</h3>')
	if skills_data.get("experience_years"):
	html_parts.append(f'<p style="color: var(--text-main); margin: 8px 0;"><strong>Years:</strong> {skills_data["experience_years"]}</p>')
	if skills_data.get("recent_roles"):
	html_parts.append('<p style="color: var(--text-main); margin: 8px 0;"><strong>Recent Roles:</strong></p>')
	html_parts.append('<ul style="color: var(--text-dim); margin-top: 4px;">')
	for role in skills_data["recent_roles"]:
	html_parts.append(f'<li>{role}</li>')
	html_parts.append('</ul>')
	html_parts.append('</div>')

	# Education
	if skills_data.get("education") or skills_data.get("certifications"):
	html_parts.append('<div style="margin-bottom: 20px;">')
	html_parts.append('<h3 style="color: var(--arc-yellow); margin-bottom: 12px;">🎓 EDUCATION & CERTIFICATIONS</h3>')
	if skills_data.get("education"):
	html_parts.append('<p style="color: var(--text-main); margin: 8px 0;"><strong>Education:</strong></p>')
	html_parts.append('<ul style="color: var(--text-dim); margin-top: 4px;">')
	for edu in skills_data["education"]:
	html_parts.append(f'<li>{edu}</li>')
	html_parts.append('</ul>')
	if skills_data.get("certifications"):
	html_parts.append('<p style="color: var(--text-main); margin: 8px 0;"><strong>Certifications:</strong></p>')
	html_parts.append('<ul style="color: var(--text-dim); margin-top: 4px;">')
	for cert in skills_data["certifications"]:
	html_parts.append(f'<li>{cert}</li>')
	html_parts.append('</ul>')
	html_parts.append('</div>')

	# Domain Expertise
	if skills_data.get("domain_expertise"):
	html_parts.append('<div style="margin-bottom: 20px;">')
	html_parts.append('<h3 style="color: var(--arc-yellow); margin-bottom: 12px;">🎯 DOMAIN EXPERTISE</h3>')
	html_parts.append('<div style="display: flex; flex-wrap: wrap; gap: 8px;">')
	for domain in skills_data["domain_expertise"]:
	html_parts.append(
	f'<span style="background: var(--bg-panel); border: 1px solid var(--border-dim); '
	f'padding: 6px 12px; border-radius: 4px; color: var(--arc-orange); font-weight: 600; '
	f'font-size: 13px;">{domain}</span>'
	)
	html_parts.append('</div></div>')

	html_parts.append('</div>')

	return ''.join(html_parts)