Spaces:

MCP-1st-Birthday
/

EastSync-AI

Running

File size: 4,977 Bytes

07273d8

"""CV Parser utility for extracting text from PDF and DOCX files."""

from __future__ import annotations

import io
from pathlib import Path
from typing import Optional


def _print_terminal_log(action: str, details: str = "") -> None:
    """Print a timestamped, tagged log line to the terminal.

    The line has the form ``[HH:MM:SS] [CV PARSER] <action>``; when
    ``details`` is non-empty, `` :: <details>`` is appended.

    Args:
        action: Short label for the step being logged.
        details: Optional extra text for the log line.
    """
    # Ordinary function-scope import instead of the __import__() call.
    from datetime import datetime

    timestamp = datetime.now().strftime("%H:%M:%S")
    suffix = f" :: {details}" if details else ""
    print(f"[{timestamp}] [CV PARSER] {action}{suffix}")


def extract_text_from_pdf(file_content: bytes) -> str:
    """Extract text from PDF file content.

    Args:
        file_content: Raw bytes of the PDF document.

    Returns:
        The text of every page that yielded text, in page order, with
        pages separated by two newlines. Pages whose extraction returns
        nothing are skipped.

    Raises:
        ValueError: If pdfplumber cannot be imported or the content cannot
            be read as a PDF. The underlying exception is attached as the
            cause.
    """
    try:
        # The import sits inside the try block, so a missing pdfplumber is
        # reported as ValueError like any other extraction failure.
        import pdfplumber

        with pdfplumber.open(io.BytesIO(file_content)) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]

        return "\n\n".join(text for text in page_texts if text)
    except Exception as e:
        # Chain explicitly so the traceback shows the root cause as the
        # direct cause of the ValueError.
        raise ValueError(f"Failed to extract text from PDF: {e}") from e


def extract_text_from_docx(file_content: bytes) -> str:
    """Extract text from DOCX file content.

    Args:
        file_content: Raw bytes of the DOCX document.

    Returns:
        All non-blank paragraph texts followed by all non-blank table cell
        texts (tables in document order, row by row), joined with single
        newlines.

    Raises:
        ValueError: If python-docx cannot be imported or the content cannot
            be read as a DOCX. The underlying exception is attached as the
            cause.
    """
    try:
        # The import sits inside the try block, so a missing python-docx is
        # reported as ValueError like any other extraction failure.
        from docx import Document

        doc = Document(io.BytesIO(file_content))

        text_content = [
            paragraph.text for paragraph in doc.paragraphs if paragraph.text.strip()
        ]

        # Also extract text from tables
        for table in doc.tables:
            for row in table.rows:
                text_content.extend(
                    cell.text for cell in row.cells if cell.text.strip()
                )

        return "\n".join(text_content)
    except Exception as e:
        # Chain explicitly so the traceback shows the root cause as the
        # direct cause of the ValueError.
        raise ValueError(f"Failed to extract text from DOCX: {e}") from e


def _detect_extension_from_content(file_content: bytes | None) -> str | None:
    """Map a file's leading bytes to '.pdf' or '.docx', or None if unknown."""
    if not file_content:
        return None
    if file_content[:4] == b"%PDF":
        return ".pdf"
    if file_content[:2] == b"PK":  # ZIP-based format (DOCX)
        return ".docx"
    return None


def parse_cv(file_path: str | None = None, file_content: bytes | None = None, log_callback=None) -> str:
    """
    Parse CV file and extract text content.

    The file type is taken from the extension of ``file_path`` when it has
    one. When there is no path, or the path has no extension, the type is
    detected from the leading bytes of the content instead (``%PDF`` for
    PDF, ``PK`` for ZIP-based DOCX).

    Args:
        file_path: Path to the CV file (optional if file_content is provided).
            The file is only read from disk when file_content is not given.
        file_content: Binary content of the file (optional if file_path is provided)
        log_callback: Optional callable invoked as ``log_callback(title, data)``
            with a string title and a dict of details for each step.

    Returns:
        Extracted text from the CV

    Raises:
        ValueError: If neither input is provided, the file format is not
            supported or cannot be determined, or parsing fails
        OSError: If file_path has to be read and cannot be opened
    """

    def notify(title, data):
        # Single place for the "callback is optional" check.
        if log_callback:
            log_callback(title, data)

    if file_content is None and file_path is None:
        raise ValueError("Either file_path or file_content must be provided")

    # Read file if path is provided
    if file_content is None and file_path:
        with open(file_path, "rb") as f:
            file_content = f.read()

    # Determine file type: trust the filename extension when there is one,
    # otherwise fall back to the content signature. An extension-less path
    # carries no type information, so rejecting it outright would refuse
    # files whose bytes clearly identify them.
    file_extension = Path(file_path).suffix.lower() if file_path else ""
    if file_extension:
        notify("File Type Detection", {"method": "filename", "extension": file_extension})
    else:
        file_extension = _detect_extension_from_content(file_content)
        if file_extension is None:
            notify("⚠️ File Type Detection Failed", {"reason": "Unknown file signature"})
            raise ValueError("Could not determine file type. Please provide a PDF or DOCX file.")
        notify("File Type Detection", {"method": "content signature", "extension": file_extension})

    # Extract text based on file type
    if file_extension == ".pdf":
        _print_terminal_log("PDF Parsing Started", "Extracting text from PDF file...")
        notify("PDF Parser", {"status": "Starting PDF text extraction..."})

        text = extract_text_from_pdf(file_content)
        # Page count is inferred from the blank-line separators between
        # pages in the extracted text; empty text means no page yielded
        # text, so report 0 rather than 1.
        pages = (text.count("\n\n") + 1) if text else 0

        _print_terminal_log("PDF Extraction Complete", f"Extracted {pages} pages, {len(text)} characters")
        notify("PDF Extraction Complete", {"pages_extracted": pages})
        return text

    if file_extension in (".docx", ".doc"):
        _print_terminal_log("DOCX Parsing Started", "Extracting text from DOCX file...")
        notify("DOCX Parser", {"status": "Starting DOCX text extraction..."})

        text = extract_text_from_docx(file_content)
        paragraphs = sum(1 for line in text.split("\n") if line.strip())

        _print_terminal_log("DOCX Extraction Complete", f"Extracted {paragraphs} paragraphs, {len(text)} characters")
        notify("DOCX Extraction Complete", {"paragraphs": paragraphs})
        return text

    notify("⚠️ Unsupported Format", {"extension": file_extension})
    raise ValueError(f"Unsupported file format: {file_extension}. Please upload a PDF or DOCX file.")