Spaces:

MCP-1st-Birthday
/

EastSync-AI

Running

EastSync-AI / utils /cv_parser.py

Daniel Tatar

Cv reader + matching project (#13)

07273d8 15 days ago

4.98 kB

	"""CV Parser utility for extracting text from PDF and DOCX files."""

	from __future__ import annotations

	import io
	from pathlib import Path
	from typing import Optional


	def _print_terminal_log(action: str, details: str = "") -> None:
	    """Print a timestamped CV-parser log line to the terminal.

	    The line has the form ``[HH:MM:SS] [CV PARSER] <action>`` and, when
	    ``details`` is non-empty, ``" :: <details>"`` is appended.

	    Args:
	        action: Short label for the event being logged.
	        details: Optional extra text shown after the ``::`` separator.
	    """
	    # A regular function-scope import replaces the __import__('datetime')
	    # call; the module-level import block is left untouched.
	    from datetime import datetime

	    timestamp = datetime.now().strftime("%H:%M:%S")
	    suffix = f" :: {details}" if details else ""
	    print(f"[{timestamp}] [CV PARSER] {action}{suffix}")


	def extract_text_from_pdf(file_content: bytes) -> str:
	    """Extract text from PDF file content.

	    Args:
	        file_content: Raw bytes of the PDF document.

	    Returns:
	        The text of every page that yielded text, with pages separated
	        by a blank line. Pages with no extractable text are skipped, so
	        the result is an empty string when no page produced text.

	    Raises:
	        ValueError: If ``pdfplumber`` cannot be imported or the content
	            cannot be opened or read as a PDF. The underlying exception
	            is attached as ``__cause__``.
	    """
	    try:
	        # Imported inside the try so a missing dependency surfaces as the
	        # same ValueError that callers already handle.
	        import pdfplumber

	        with pdfplumber.open(io.BytesIO(file_content)) as pdf:
	            page_texts = [page.extract_text() for page in pdf.pages]
	            # extract_text() may give an empty/None result; drop those pages.
	            return "\n\n".join(text for text in page_texts if text)
	    except Exception as e:
	        # Chain explicitly so the traceback shows the real root cause.
	        raise ValueError(f"Failed to extract text from PDF: {str(e)}") from e


	def extract_text_from_docx(file_content: bytes) -> str:
	    """Extract text from DOCX file content.

	    Args:
	        file_content: Raw bytes of the DOCX document.

	    Returns:
	        Newline-joined text of all non-blank body paragraphs, followed by
	        the text of all non-blank table cells. Table text is appended
	        after the paragraphs, not interleaved in document order.

	    Raises:
	        ValueError: If ``docx`` cannot be imported or the content cannot
	            be opened or read as a DOCX file. The underlying exception is
	            attached as ``__cause__``.
	    """
	    try:
	        # Imported inside the try so a missing dependency surfaces as the
	        # same ValueError that callers already handle.
	        from docx import Document

	        doc = Document(io.BytesIO(file_content))

	        text_content = [
	            paragraph.text
	            for paragraph in doc.paragraphs
	            if paragraph.text.strip()
	        ]

	        # Also extract text from tables
	        # NOTE(review): merged cells may be yielded more than once by
	        # row.cells, which would duplicate their text -- confirm against
	        # the python-docx documentation.
	        for table in doc.tables:
	            for row in table.rows:
	                for cell in row.cells:
	                    if cell.text.strip():
	                        text_content.append(cell.text)

	        return "\n".join(text_content)
	    except Exception as e:
	        # Chain explicitly so the traceback shows the real root cause.
	        raise ValueError(f"Failed to extract text from DOCX: {str(e)}") from e


	def parse_cv(file_path: str | None = None, file_content: bytes | None = None, log_callback=None) -> str:
	    """
	    Parse CV file and extract text content.

	    The file type is taken from the extension of ``file_path`` when a path
	    is given; otherwise it is detected from the leading bytes of
	    ``file_content`` (``%PDF`` for PDF, ``PK`` for ZIP-based DOCX).

	    Args:
	        file_path: Path to the CV file (optional if file_content is provided).
	            When file_content is also given, the path is used only for its
	            extension and the file is not read.
	        file_content: Binary content of the file (optional if file_path is provided)
	        log_callback: Optional callable invoked as
	            ``log_callback(action, details)`` with a string label and a
	            dict of details at each parsing step.

	    Returns:
	        Extracted text from the CV

	    Raises:
	        ValueError: If neither argument is provided, the file format is not
	            supported or cannot be detected, or parsing fails
	        OSError: If file_path has to be read and cannot be opened
	    """
	    if file_content is None and file_path is None:
	        raise ValueError("Either file_path or file_content must be provided")

	    def notify(action, details):
	        # Single place for the "callback is optional" check.
	        if log_callback:
	            log_callback(action, details)

	    # Read file if path is provided
	    if file_content is None and file_path:
	        with open(file_path, "rb") as f:
	            file_content = f.read()

	    # Determine file type
	    if file_path:
	        file_extension = Path(file_path).suffix.lower()
	        notify("File Type Detection", {"method": "filename", "extension": file_extension})
	    else:
	        # Try to detect from content
	        if file_content and file_content.startswith(b'%PDF'):
	            file_extension = '.pdf'
	        elif file_content and file_content.startswith(b'PK'):  # ZIP-based format (DOCX)
	            file_extension = '.docx'
	        else:
	            notify("⚠️ File Type Detection Failed", {"reason": "Unknown file signature"})
	            raise ValueError("Could not determine file type. Please provide a PDF or DOCX file.")

	        notify("File Type Detection", {"method": "content signature", "extension": file_extension})

	    # Extract text based on file type
	    if file_extension == '.pdf':
	        _print_terminal_log("PDF Parsing Started", "Extracting text from PDF file...")
	        notify("PDF Parser", {"status": "Starting PDF text extraction..."})

	        text = extract_text_from_pdf(file_content)
	        # The PDF extractor joins pages with a blank line, so counting the
	        # separators approximates the number of pages that produced text.
	        # Empty text means no page produced anything, so report 0, not 1.
	        pages = text.count('\n\n') + 1 if text else 0

	        _print_terminal_log("PDF Extraction Complete", f"Extracted {pages} pages, {len(text)} characters")
	        notify("PDF Extraction Complete", {"pages_extracted": pages})
	        return text

	    # '.doc' is routed to the DOCX extractor as well.
	    # NOTE(review): legacy binary .doc files are presumably not readable by
	    # that extractor and would end in its ValueError -- confirm.
	    if file_extension in ('.docx', '.doc'):
	        _print_terminal_log("DOCX Parsing Started", "Extracting text from DOCX file...")
	        notify("DOCX Parser", {"status": "Starting DOCX text extraction..."})

	        text = extract_text_from_docx(file_content)
	        paragraphs = sum(1 for line in text.split('\n') if line.strip())

	        _print_terminal_log("DOCX Extraction Complete", f"Extracted {paragraphs} paragraphs, {len(text)} characters")
	        notify("DOCX Extraction Complete", {"paragraphs": paragraphs})
	        return text

	    notify("⚠️ Unsupported Format", {"extension": file_extension})
	    raise ValueError(f"Unsupported file format: {file_extension}. Please upload a PDF or DOCX file.")