# EastSync-AI / utils /cv_parser.py
# Daniel Tatar
# Cv reader + matching project (#13)
# 07273d8
"""CV Parser utility for extracting text from PDF and DOCX files."""
from __future__ import annotations
import io
from pathlib import Path
from typing import Optional
def _print_terminal_log(action: str, details: str = "") -> None:
    """Print a timestamped ``[CV PARSER]`` log line to stdout.

    Args:
        action: Short label for the event being logged.
        details: Optional extra text; when non-empty it is appended to the
            line after a ``::`` separator.
    """
    # Imported locally so this helper needs nothing beyond its own body.
    from datetime import datetime

    timestamp = datetime.now().strftime("%H:%M:%S")
    # The " :: details" suffix is emitted only when details is non-empty.
    suffix = f" :: {details}" if details else ""
    print(f"[{timestamp}] [CV PARSER] {action}{suffix}")
def extract_text_from_pdf(file_content: bytes) -> str:
    """Extract text from PDF file content.

    Args:
        file_content: Raw bytes of the PDF document.

    Returns:
        The text of every page that yielded text, with a blank line between
        consecutive pages. Pages for which no text is returned are skipped.

    Raises:
        ValueError: If pdfplumber cannot be imported or extraction fails for
            any reason. The underlying exception is chained as the cause.
    """
    try:
        # Imported inside the try so a missing dependency is reported as
        # ValueError, like every other failure of this function.
        import pdfplumber

        with pdfplumber.open(io.BytesIO(file_content)) as pdf:
            # Pages must be read while the document is still open.
            page_texts = [page.extract_text() for page in pdf.pages]
        # Drop pages that produced no text (None or empty string).
        return "\n\n".join(text for text in page_texts if text)
    except Exception as e:
        # Chain explicitly so the original traceback stays attached.
        raise ValueError(f"Failed to extract text from PDF: {e}") from e
def extract_text_from_docx(file_content: bytes) -> str:
    """Extract text from DOCX file content.

    Args:
        file_content: Raw bytes of the DOCX document.

    Returns:
        Newline-joined text: all non-blank body paragraphs first, followed by
        the text of every non-blank table cell. Table text is therefore not
        interleaved with the paragraphs around it.

    Raises:
        ValueError: If python-docx cannot be imported or extraction fails for
            any reason. The underlying exception is chained as the cause.
    """
    try:
        # Imported inside the try so a missing dependency is reported as
        # ValueError, like every other failure of this function.
        from docx import Document

        doc = Document(io.BytesIO(file_content))

        # Body paragraphs, skipping whitespace-only ones.
        text_content = [
            paragraph.text
            for paragraph in doc.paragraphs
            if paragraph.text.strip()
        ]

        # Also extract text from tables.
        # NOTE(review): python-docx may return the same cell more than once
        # for merged cells, which would duplicate text here - confirm whether
        # de-duplication is wanted.
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    if cell.text.strip():
                        text_content.append(cell.text)

        return "\n".join(text_content)
    except Exception as e:
        # Chain explicitly so the original traceback stays attached.
        raise ValueError(f"Failed to extract text from DOCX: {e}") from e
def parse_cv(file_path: str | None = None, file_content: bytes | None = None, log_callback=None) -> str:
    """
    Parse a CV file and extract its text content.

    Args:
        file_path: Path to the CV file (optional if file_content is provided).
            When given, its suffix decides the file type; the file is read
            from disk only if file_content is not supplied.
        file_content: Binary content of the file (optional if file_path is
            provided). Without a file_path the type is detected from the
            leading bytes: ``%PDF`` means PDF, ``PK`` (ZIP container) is
            treated as DOCX.
        log_callback: Optional callable invoked as
            ``log_callback(title, payload_dict)`` at each parsing step.

    Returns:
        Extracted text from the CV.

    Raises:
        ValueError: If neither input is provided, the file type cannot be
            determined or is not supported, or text extraction fails.
        OSError: If file_path has to be read from disk and cannot be opened.
    """

    def notify(title, payload):
        # Forward a progress event to the caller's callback, if one was given.
        if log_callback:
            log_callback(title, payload)

    if file_content is None and file_path is None:
        raise ValueError("Either file_path or file_content must be provided")

    # Read the file only when the caller did not already supply its bytes.
    if file_content is None and file_path:
        with open(file_path, "rb") as f:
            file_content = f.read()

    # Determine file type: the filename wins, otherwise sniff the signature.
    if file_path:
        file_extension = Path(file_path).suffix.lower()
        notify("File Type Detection", {"method": "filename", "extension": file_extension})
    else:
        if file_content and file_content.startswith(b"%PDF"):
            file_extension = ".pdf"
        elif file_content and file_content.startswith(b"PK"):  # ZIP-based format (DOCX)
            file_extension = ".docx"
        else:
            notify("⚠️ File Type Detection Failed", {"reason": "Unknown file signature"})
            raise ValueError("Could not determine file type. Please provide a PDF or DOCX file.")
        notify("File Type Detection", {"method": "content signature", "extension": file_extension})

    # Extract text based on file type.
    if file_extension == ".pdf":
        _print_terminal_log("PDF Parsing Started", "Extracting text from PDF file...")
        notify("PDF Parser", {"status": "Starting PDF text extraction..."})
        text = extract_text_from_pdf(file_content)
        # Estimate only: counts the blank-line separators in the extracted
        # text. Empty text means no page produced text, so report 0 pages
        # rather than 1.
        pages = text.count("\n\n") + 1 if text else 0
        _print_terminal_log("PDF Extraction Complete", f"Extracted {pages} pages, {len(text)} characters")
        notify("PDF Extraction Complete", {"pages_extracted": pages})
        return text

    if file_extension in (".docx", ".doc"):
        # '.doc' is routed to the DOCX extractor as well; any failure there
        # surfaces as ValueError.
        _print_terminal_log("DOCX Parsing Started", "Extracting text from DOCX file...")
        notify("DOCX Parser", {"status": "Starting DOCX text extraction..."})
        text = extract_text_from_docx(file_content)
        paragraphs = sum(1 for line in text.split("\n") if line.strip())
        _print_terminal_log("DOCX Extraction Complete", f"Extracted {paragraphs} paragraphs, {len(text)} characters")
        notify("DOCX Extraction Complete", {"paragraphs": paragraphs})
        return text

    notify("⚠️ Unsupported Format", {"extension": file_extension})
    raise ValueError(f"Unsupported file format: {file_extension}. Please upload a PDF or DOCX file.")