Spaces:
Running
Running
File size: 4,977 Bytes
07273d8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 |
"""CV Parser utility for extracting text from PDF and DOCX files."""
from __future__ import annotations
import io
from pathlib import Path
from typing import Optional
def _print_terminal_log(action: str, details: str = ""):
    """Write one timestamped ``[CV PARSER]`` line to standard output.

    The line is prefixed with the current local time formatted as
    ``HH:MM:SS``. ``details`` is appended after a `` :: `` separator only
    when it is non-empty.
    """
    import datetime

    timestamp = datetime.datetime.now().strftime("%H:%M:%S")
    if not details:
        print(f"[{timestamp}] [CV PARSER] {action}")
        return
    print(f"[{timestamp}] [CV PARSER] {action} :: {details}")
def extract_text_from_pdf(file_content: bytes) -> str:
    """Extract the text of every page of a PDF held in memory.

    Args:
        file_content: Raw bytes of the PDF file.

    Returns:
        The text of each page that yielded any text, in page order, joined
        by a blank line (``"\\n\\n"``). Pages whose extraction result is
        empty are skipped, so the result is ``""`` when no page yields text.

    Raises:
        ValueError: If ``pdfplumber`` cannot be imported or the content
            cannot be opened/parsed. The underlying exception is attached
            as ``__cause__``.
    """
    try:
        # Importing inside the try means a missing pdfplumber surfaces as
        # the same ValueError as a parse failure.
        import pdfplumber

        with pdfplumber.open(io.BytesIO(file_content)) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
    except Exception as e:
        # Chain explicitly so callers and tracebacks keep the root cause.
        raise ValueError(f"Failed to extract text from PDF: {e}") from e
    return "\n\n".join(text for text in page_texts if text)
def extract_text_from_docx(file_content: bytes) -> str:
    """Extract paragraph and table text from a DOCX file held in memory.

    Args:
        file_content: Raw bytes of the DOCX file.

    Returns:
        The text of every non-blank paragraph in ``doc.paragraphs``,
        followed by the text of every non-blank table cell (table by table,
        row by row), joined by single newlines.

    Raises:
        ValueError: If the ``docx`` package cannot be imported or the
            content cannot be opened/parsed. The underlying exception is
            attached as ``__cause__``.
    """
    try:
        # Importing inside the try means a missing docx package surfaces as
        # the same ValueError as a parse failure.
        from docx import Document

        doc = Document(io.BytesIO(file_content))
        text_content = [p.text for p in doc.paragraphs if p.text.strip()]

        # Also extract text from tables; they are walked separately from
        # doc.paragraphs.
        # NOTE(review): python-docx appears to report a merged cell once per
        # grid position it spans, so merged cells may contribute duplicate
        # text here - confirm whether de-duplication is wanted.
        for table in doc.tables:
            for row in table.rows:
                text_content.extend(
                    cell.text for cell in row.cells if cell.text.strip()
                )
    except Exception as e:
        # Chain explicitly so callers and tracebacks keep the root cause.
        raise ValueError(f"Failed to extract text from DOCX: {e}") from e
    return "\n".join(text_content)
def _sniff_extension(file_content: bytes | None) -> str | None:
    """Guess '.pdf' or '.docx' from the leading magic bytes, else None."""
    if not file_content:
        return None
    if file_content.startswith(b"%PDF"):
        return ".pdf"
    if file_content.startswith(b"PK"):  # ZIP-based format (treated as DOCX)
        return ".docx"
    return None


def parse_cv(file_path: str | None = None, file_content: bytes | None = None, log_callback=None) -> str:
    """
    Parse a CV file and extract its text content.

    The file type is taken from the extension of ``file_path`` when it has
    one; otherwise (no path, or a path without a suffix) it is detected from
    the leading bytes of the content (``%PDF`` -> PDF, ``PK`` -> DOCX).

    Args:
        file_path: Path to the CV file (optional if file_content is provided).
            When file_content is also given, the path is used only for its
            extension and the file is not read.
        file_content: Binary content of the file (optional if file_path is
            provided).
        log_callback: Optional callable invoked as
            ``log_callback(event_name, details_dict)`` at each stage
            (type detection, parser start, extraction complete, failures).

    Returns:
        Extracted text from the CV.

    Raises:
        ValueError: If neither argument is provided, the file type cannot be
            determined, the format is not supported, or parsing fails.
        OSError: If file_path has to be read and cannot be opened.
    """
    if file_content is None and file_path is None:
        raise ValueError("Either file_path or file_content must be provided")

    def notify(event, payload):
        # Single place for the "callback is optional" check.
        if log_callback:
            log_callback(event, payload)

    # Read the file only when no content was handed in directly.
    if file_content is None and file_path:
        with open(file_path, "rb") as f:
            file_content = f.read()

    # Determine file type: prefer the filename extension, fall back to the
    # content signature when there is no extension to go by.
    file_extension = Path(file_path).suffix.lower() if file_path else ""
    if file_extension:
        notify("File Type Detection", {"method": "filename", "extension": file_extension})
    else:
        file_extension = _sniff_extension(file_content)
        if file_extension is None:
            notify("⚠️ File Type Detection Failed", {"reason": "Unknown file signature"})
            raise ValueError("Could not determine file type. Please provide a PDF or DOCX file.")
        notify("File Type Detection", {"method": "content signature", "extension": file_extension})

    # Extract text based on file type
    if file_extension == ".pdf":
        _print_terminal_log("PDF Parsing Started", "Extracting text from PDF file...")
        notify("PDF Parser", {"status": "Starting PDF text extraction..."})
        text = extract_text_from_pdf(file_content)
        # extract_text_from_pdf joins page texts with a blank line, so the
        # separator count approximates the number of pages that produced
        # text. Empty output means no page produced text.
        pages = (text.count("\n\n") + 1) if text else 0
        _print_terminal_log("PDF Extraction Complete", f"Extracted {pages} pages, {len(text)} characters")
        notify("PDF Extraction Complete", {"pages_extracted": pages})
        return text

    if file_extension in (".docx", ".doc"):
        # NOTE(review): '.doc' is routed to the DOCX extractor; a legacy
        # binary .doc is presumably not readable by it and would fail there
        # with a ValueError - confirm whether '.doc' should be rejected here.
        _print_terminal_log("DOCX Parsing Started", "Extracting text from DOCX file...")
        notify("DOCX Parser", {"status": "Starting DOCX text extraction..."})
        text = extract_text_from_docx(file_content)
        paragraphs = sum(1 for line in text.split("\n") if line.strip())
        _print_terminal_log("DOCX Extraction Complete", f"Extracted {paragraphs} paragraphs, {len(text)} characters")
        notify("DOCX Extraction Complete", {"paragraphs": paragraphs})
        return text

    notify("⚠️ Unsupported Format", {"extension": file_extension})
    raise ValueError(f"Unsupported file format: {file_extension}. Please upload a PDF or DOCX file.")
|