Spaces:
Running
Running
| """CV Parser utility for extracting text from PDF and DOCX files.""" | |
| from __future__ import annotations | |
| import io | |
| from pathlib import Path | |
| from typing import Optional | |
def _print_terminal_log(action: str, details: str = "") -> None:
    """Print a timestamped CV-parser log line to the terminal.

    Args:
        action: Short label for the step being logged.
        details: Optional extra text; when non-empty it is appended after
            the action, separated by `` :: ``.
    """
    # Function-scope import keeps this helper self-contained without
    # resorting to an inline __import__ call.
    from datetime import datetime

    timestamp = datetime.now().strftime("%H:%M:%S")
    message = f"[{timestamp}] [CV PARSER] {action}"
    if details:
        message = f"{message} :: {details}"
    print(message)
def extract_text_from_pdf(file_content: bytes) -> str:
    """Extract text from PDF file content.

    Args:
        file_content: Raw bytes of the PDF document.

    Returns:
        The text of every page that yielded any text, with pages separated
        by a blank line. Pages without extractable text are skipped, so the
        result may be an empty string.

    Raises:
        ValueError: If anything goes wrong during extraction. The original
            exception is attached as ``__cause__``.
    """
    try:
        # Imported inside the try block so that a missing pdfplumber install
        # is reported through the same ValueError as any parsing failure.
        import pdfplumber

        with pdfplumber.open(io.BytesIO(file_content)) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
        # extract_text() can yield an empty/None result; drop those pages.
        return "\n\n".join(text for text in page_texts if text)
    except Exception as e:
        # Chain explicitly so the root cause stays visible in tracebacks.
        raise ValueError(f"Failed to extract text from PDF: {e}") from e
def extract_text_from_docx(file_content: bytes) -> str:
    """Extract text from DOCX file content.

    Args:
        file_content: Raw bytes of the DOCX document.

    Returns:
        Newline-joined text of all non-blank body paragraphs, followed by
        the text of all non-blank table cells. Table text is appended after
        the paragraphs, so the original interleaving is not preserved.

    Raises:
        ValueError: If anything goes wrong during extraction. The original
            exception is attached as ``__cause__``.
    """
    try:
        # Imported inside the try block so that a missing python-docx install
        # is reported through the same ValueError as any parsing failure.
        from docx import Document

        doc = Document(io.BytesIO(file_content))

        # Body paragraphs first, skipping whitespace-only ones.
        text_content = [
            paragraph.text
            for paragraph in doc.paragraphs
            if paragraph.text.strip()
        ]

        # Also extract text from tables.
        # NOTE(review): python-docx may hand back the same cell more than once
        # for merged cells, which would repeat its text here - confirm.
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    if cell.text.strip():
                        text_content.append(cell.text)

        return "\n".join(text_content)
    except Exception as e:
        # Chain explicitly so the root cause stays visible in tracebacks.
        raise ValueError(f"Failed to extract text from DOCX: {e}") from e
def parse_cv(file_path: str | None = None, file_content: bytes | None = None, log_callback=None) -> str:
    """
    Parse CV file and extract text content.

    The file type is taken from the extension of ``file_path`` when a path is
    given; otherwise it is detected from the leading bytes of
    ``file_content`` (``%PDF`` for PDF, ``PK`` for ZIP-based DOCX).

    Args:
        file_path: Path to the CV file (optional if file_content is provided).
            When file_content is also given, the path is only used to
            determine the file type and the file is not read.
        file_content: Binary content of the file (optional if file_path is
            provided).
        log_callback: Optional callable invoked as
            ``log_callback(title, payload_dict)`` at each processing step.

    Returns:
        Extracted text from the CV

    Raises:
        ValueError: If neither input is provided, the file type cannot be
            determined or is not supported, or parsing fails.

    Errors raised while opening or reading ``file_path`` propagate unchanged.
    """

    def _notify(title, payload):
        # Forward a progress event only when a callback was supplied.
        if log_callback:
            log_callback(title, payload)

    # An empty path carries no information, so it counts as "not provided";
    # otherwise the caller would get a misleading file-type error below.
    if file_content is None and not file_path:
        raise ValueError("Either file_path or file_content must be provided")

    # Read file if only a path was provided
    if file_content is None:
        with open(file_path, "rb") as f:
            file_content = f.read()

    # Determine file type
    if file_path:
        file_extension = Path(file_path).suffix.lower()
        _notify("File Type Detection", {"method": "filename", "extension": file_extension})
    else:
        # No usable path: fall back to the file signature (magic bytes)
        if file_content[:4] == b'%PDF':
            file_extension = '.pdf'
        elif file_content[:2] == b'PK':  # ZIP-based format (DOCX)
            file_extension = '.docx'
        else:
            _notify("⚠️ File Type Detection Failed", {"reason": "Unknown file signature"})
            raise ValueError("Could not determine file type. Please provide a PDF or DOCX file.")
        _notify("File Type Detection", {"method": "content signature", "extension": file_extension})

    # Extract text based on file type
    if file_extension == '.pdf':
        _print_terminal_log("PDF Parsing Started", "Extracting text from PDF file...")
        _notify("PDF Parser", {"status": "Starting PDF text extraction..."})
        text = extract_text_from_pdf(file_content)
        # Estimate only: counts the blank-line separators in the extracted
        # text, so blank lines inside a page inflate the figure.
        pages = text.count('\n\n') + 1
        _print_terminal_log("PDF Extraction Complete", f"Extracted {pages} pages, {len(text)} characters")
        _notify("PDF Extraction Complete", {"pages_extracted": pages})
        return text

    if file_extension in ('.docx', '.doc'):
        _print_terminal_log("DOCX Parsing Started", "Extracting text from DOCX file...")
        _notify("DOCX Parser", {"status": "Starting DOCX text extraction..."})
        text = extract_text_from_docx(file_content)
        # Counts non-blank lines of the extracted text.
        paragraphs = len([p for p in text.split('\n') if p.strip()])
        _print_terminal_log("DOCX Extraction Complete", f"Extracted {paragraphs} paragraphs, {len(text)} characters")
        _notify("DOCX Extraction Complete", {"paragraphs": paragraphs})
        return text

    _notify("⚠️ Unsupported Format", {"extension": file_extension})
    raise ValueError(f"Unsupported file format: {file_extension}. Please upload a PDF or DOCX file.")