"""CV Parser utility for extracting text from PDF and DOCX files.""" from __future__ import annotations import io from pathlib import Path from typing import Optional def _print_terminal_log(action: str, details: str = ""): """Print formatted log to terminal.""" timestamp = __import__('datetime').datetime.now().strftime("%H:%M:%S") if details: print(f"[{timestamp}] [CV PARSER] {action} :: {details}") else: print(f"[{timestamp}] [CV PARSER] {action}") def extract_text_from_pdf(file_content: bytes) -> str: """Extract text from PDF file content.""" try: import pdfplumber text_content = [] with pdfplumber.open(io.BytesIO(file_content)) as pdf: for page in pdf.pages: page_text = page.extract_text() if page_text: text_content.append(page_text) return "\n\n".join(text_content) except Exception as e: raise ValueError(f"Failed to extract text from PDF: {str(e)}") def extract_text_from_docx(file_content: bytes) -> str: """Extract text from DOCX file content.""" try: from docx import Document doc = Document(io.BytesIO(file_content)) text_content = [] for paragraph in doc.paragraphs: if paragraph.text.strip(): text_content.append(paragraph.text) # Also extract text from tables for table in doc.tables: for row in table.rows: for cell in row.cells: if cell.text.strip(): text_content.append(cell.text) return "\n".join(text_content) except Exception as e: raise ValueError(f"Failed to extract text from DOCX: {str(e)}") def parse_cv(file_path: str | None = None, file_content: bytes | None = None, log_callback=None) -> str: """ Parse CV file and extract text content. Args: file_path: Path to the CV file (optional if file_content is provided) file_content: Binary content of the file (optional if file_path is provided) Returns: Extracted text from the CV Raises: ValueError: If file format is not supported or parsing fails """ if file_content is None and file_path is None: raise ValueError("Either file_path or file_content must be provided") # Read file if path is provided if file_content is None and file_path: with open(file_path, "rb") as f: file_content = f.read() # Determine file type if file_path: file_extension = Path(file_path).suffix.lower() if log_callback: log_callback("File Type Detection", {"method": "filename", "extension": file_extension}) else: # Try to detect from content if file_content and file_content[:4] == b'%PDF': file_extension = '.pdf' elif file_content and file_content[:2] == b'PK': # ZIP-based format (DOCX) file_extension = '.docx' else: if log_callback: log_callback("⚠️ File Type Detection Failed", {"reason": "Unknown file signature"}) raise ValueError("Could not determine file type. Please provide a PDF or DOCX file.") if log_callback: log_callback("File Type Detection", {"method": "content signature", "extension": file_extension}) # Extract text based on file type if file_extension == '.pdf': _print_terminal_log("PDF Parsing Started", f"Extracting text from PDF file...") if log_callback: log_callback("PDF Parser", {"status": "Starting PDF text extraction..."}) text = extract_text_from_pdf(file_content) pages = text.count('\n\n') + 1 _print_terminal_log("PDF Extraction Complete", f"Extracted {pages} pages, {len(text)} characters") if log_callback: log_callback("PDF Extraction Complete", {"pages_extracted": pages}) return text elif file_extension in ['.docx', '.doc']: _print_terminal_log("DOCX Parsing Started", f"Extracting text from DOCX file...") if log_callback: log_callback("DOCX Parser", {"status": "Starting DOCX text extraction..."}) text = extract_text_from_docx(file_content) paragraphs = len([p for p in text.split('\n') if p.strip()]) _print_terminal_log("DOCX Extraction Complete", f"Extracted {paragraphs} paragraphs, {len(text)} characters") if log_callback: log_callback("DOCX Extraction Complete", {"paragraphs": paragraphs}) return text else: if log_callback: log_callback("⚠️ Unsupported Format", {"extension": file_extension}) raise ValueError(f"Unsupported file format: {file_extension}. Please upload a PDF or DOCX file.")