File size: 4,977 Bytes
07273d8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
"""CV Parser utility for extracting text from PDF and DOCX files."""

from __future__ import annotations

import io
from pathlib import Path
from typing import Optional


def _print_terminal_log(action: str, details: str = ""):
    """Emit one timestamped ``[CV PARSER]`` line on standard output.

    Args:
        action: Short label for the event being reported.
        details: Optional extra text; when non-empty it is appended to the
            line after a ``" :: "`` separator.
    """
    from datetime import datetime

    # Local wall-clock time, hours:minutes:seconds only.
    stamp = datetime.now().strftime("%H:%M:%S")
    line = f"[{stamp}] [CV PARSER] {action}"
    if details:
        line = f"{line} :: {details}"
    print(line)


def extract_text_from_pdf(file_content: bytes) -> str:
    """Extract text from PDF file content.

    Args:
        file_content: Raw bytes of the PDF document.

    Returns:
        The text of every page that yielded any text, in page order, joined
        with a blank line (``"\\n\\n"``) between pages. Pages for which
        ``extract_text()`` returns nothing are skipped.

    Raises:
        ValueError: If anything goes wrong while opening or reading the PDF.
            This includes ``pdfplumber`` not being importable, because the
            import happens inside the guarded section. The original exception
            is attached as ``__cause__``.
    """
    try:
        # Imported lazily so the module can be loaded without pdfplumber.
        import pdfplumber

        with pdfplumber.open(io.BytesIO(file_content)) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]

        return "\n\n".join(text for text in page_texts if text)
    except Exception as e:
        # Chain explicitly so the root cause survives in the traceback.
        raise ValueError(f"Failed to extract text from PDF: {e}") from e


def extract_text_from_docx(file_content: bytes) -> str:
    """Extract text from DOCX file content.

    Args:
        file_content: Raw bytes of the DOCX document.

    Returns:
        All non-blank body paragraphs followed by all non-blank table cell
        texts (tables in document order, row by row, cell by cell), joined
        with single newlines.

    Raises:
        ValueError: If anything goes wrong while opening or reading the
            document. This includes ``docx`` not being importable, because
            the import happens inside the guarded section. The original
            exception is attached as ``__cause__``.
    """
    try:
        # Imported lazily so the module can be loaded without python-docx.
        from docx import Document

        doc = Document(io.BytesIO(file_content))

        text_content = [
            paragraph.text
            for paragraph in doc.paragraphs
            if paragraph.text.strip()
        ]

        # Table text is not part of doc.paragraphs, so collect it separately.
        for table in doc.tables:
            for row in table.rows:
                text_content.extend(
                    cell.text for cell in row.cells if cell.text.strip()
                )

        return "\n".join(text_content)
    except Exception as e:
        # Chain explicitly so the root cause survives in the traceback.
        raise ValueError(f"Failed to extract text from DOCX: {e}") from e


def _sniff_extension(file_content: bytes | None) -> str | None:
    """Guess '.pdf' or '.docx' from the leading bytes, or return None."""
    if not file_content:
        return None
    if file_content[:4] == b'%PDF':
        return '.pdf'
    if file_content[:2] == b'PK':  # ZIP-based format (treated as DOCX)
        return '.docx'
    return None


def parse_cv(file_path: str | None = None, file_content: bytes | None = None, log_callback=None) -> str:
    """
    Parse CV file and extract text content.

    The file type is taken from the extension of ``file_path`` when a path is
    given; otherwise it is guessed from the first bytes of ``file_content``
    (``%PDF`` -> PDF, ``PK`` -> DOCX).

    Args:
        file_path: Path to the CV file (optional if file_content is provided).
            An empty string is treated as "not provided".
        file_content: Binary content of the file (optional if file_path is
            provided). When both are given, the content is used as-is and the
            path only supplies the extension.
        log_callback: Optional callable invoked as
            ``log_callback(event_name, payload_dict)`` at each step
            (type detection, parser start, extraction complete, failures).

    Returns:
        Extracted text from the CV

    Raises:
        ValueError: If neither input is provided, the file format is not
            supported or cannot be determined, or parsing fails.
        OSError: If ``file_path`` has to be read and cannot be opened.
    """
    # Use truthiness for the path so "" gets the accurate "nothing provided"
    # error rather than falling through to signature detection on None.
    if file_content is None and not file_path:
        raise ValueError("Either file_path or file_content must be provided")

    # Read file if only a path is provided
    if file_content is None:
        with open(file_path, "rb") as f:
            file_content = f.read()

    # Determine file type
    if file_path:
        file_extension = Path(file_path).suffix.lower()
        if log_callback:
            log_callback("File Type Detection", {"method": "filename", "extension": file_extension})
    else:
        # No filename available: fall back to the content signature
        file_extension = _sniff_extension(file_content)
        if file_extension is None:
            if log_callback:
                log_callback("⚠️ File Type Detection Failed", {"reason": "Unknown file signature"})
            raise ValueError("Could not determine file type. Please provide a PDF or DOCX file.")

        if log_callback:
            log_callback("File Type Detection", {"method": "content signature", "extension": file_extension})

    # Extract text based on file type
    if file_extension == '.pdf':
        _print_terminal_log("PDF Parsing Started", "Extracting text from PDF file...")

        if log_callback:
            log_callback("PDF Parser", {"status": "Starting PDF text extraction..."})

        text = extract_text_from_pdf(file_content)
        # Approximate count: the number of blank-line separators in the text
        # plus one; reported as 0 when no text was extracted at all.
        pages = text.count('\n\n') + 1 if text else 0

        _print_terminal_log("PDF Extraction Complete", f"Extracted {pages} pages, {len(text)} characters")

        if log_callback:
            log_callback("PDF Extraction Complete", {"pages_extracted": pages})
        return text

    if file_extension in ('.docx', '.doc'):
        # NOTE(review): '.doc' is routed to the DOCX extractor; confirm that
        # legacy binary .doc files are actually expected to parse here.
        _print_terminal_log("DOCX Parsing Started", "Extracting text from DOCX file...")

        if log_callback:
            log_callback("DOCX Parser", {"status": "Starting DOCX text extraction..."})

        text = extract_text_from_docx(file_content)
        # Count of non-blank lines in the extracted text.
        paragraphs = sum(1 for line in text.split('\n') if line.strip())

        _print_terminal_log("DOCX Extraction Complete", f"Extracted {paragraphs} paragraphs, {len(text)} characters")

        if log_callback:
            log_callback("DOCX Extraction Complete", {"paragraphs": paragraphs})
        return text

    if log_callback:
        log_callback("⚠️ Unsupported Format", {"extension": file_extension})
    raise ValueError(f"Unsupported file format: {file_extension}. Please upload a PDF or DOCX file.")