Spaces:

hellorahulk
/

docling_free

Running

App Files Files Community

docling_free / dockling_parser /parser.py

hellorahulk

Improve error handling and file processing

fdbfd73 11 months ago

raw

history blame contribute delete

6.34 kB

	import os
	from pathlib import Path
	from typing import Optional, Dict, Any, Union
	import magic
	from docling.document_converter import DocumentConverter
	from datetime import datetime
	import shutil
	import tempfile

	from .types import ParsedDocument, DocumentMetadata
	from .exceptions import UnsupportedFormatError, ParseError

	class DocumentParser:
	    """
	    A multiformat document parser using Docling.

	    Each input file is copied into a per-instance temporary directory under a
	    name whose extension matches its detected format, converted with
	    Docling's ``DocumentConverter``, and returned as a ``ParsedDocument``.
	    """

	    # MIME type -> short format name. Several MIME spellings may map to the
	    # same format.
	    SUPPORTED_FORMATS = {
	        'application/pdf': 'pdf',
	        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
	        'text/plain': 'txt',
	        'text/html': 'html',
	        'text/markdown': 'md',
	        # Add common variations
	        'application/x-pdf': 'pdf',
	        'application/acrobat': 'pdf',
	        'application/msword': 'docx',
	        'text/x-markdown': 'md',
	        'text/x-html': 'html',
	    }

	    # Lower-cased file extension -> canonical MIME type (a key of
	    # SUPPORTED_FORMATS).
	    EXTENSION_TO_MIME = {
	        '.pdf': 'application/pdf',
	        '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
	        '.txt': 'text/plain',
	        '.html': 'text/html',
	        '.htm': 'text/html',
	        '.md': 'text/markdown',
	        '.markdown': 'text/markdown',
	    }

	    def __init__(self, config: Optional[Dict[str, Any]] = None):
	        """
	        Create a parser.

	        Args:
	            config: Optional settings dictionary; stored on ``self.config``
	                (an empty dict when omitted).
	        """
	        self.config = config or {}
	        self.converter = DocumentConverter()
	        # Create a temporary directory for processing files
	        self.temp_dir = Path(tempfile.mkdtemp(prefix="dockling_"))

	    def __del__(self):
	        """Cleanup temporary directory on object destruction"""
	        # hasattr guard: __init__ may have failed before temp_dir was set.
	        if hasattr(self, 'temp_dir') and self.temp_dir.exists():
	            shutil.rmtree(self.temp_dir, ignore_errors=True)

	    @classmethod
	    def _resolve_file_type(cls, detected_mime: str, extension: str) -> tuple:
	        """
	        Pick the ``(file_type, mime_type)`` pair to report for a document.

	        The content-detected MIME type is used when it is a supported one.
	        Otherwise the already-validated file extension decides, so a
	        recognised extension whose content sniffs as something outside
	        SUPPORTED_FORMATS no longer triggers a ``KeyError``.

	        Args:
	            detected_mime: MIME type reported by content detection.
	            extension: File extension including the leading dot.

	        Returns:
	            Tuple ``(file_type, mime_type)`` where ``mime_type`` is a key of
	            SUPPORTED_FORMATS and ``file_type`` is its mapped value.

	        Raises:
	            UnsupportedFormatError: If neither the detected MIME type nor the
	                extension is known.
	        """
	        if detected_mime in cls.SUPPORTED_FORMATS:
	            return cls.SUPPORTED_FORMATS[detected_mime], detected_mime

	        canonical_mime = cls.EXTENSION_TO_MIME.get(extension.lower())
	        if canonical_mime is None:
	            raise UnsupportedFormatError(
	                f"Unsupported file format: {detected_mime}. "
	                f"Supported formats are: {', '.join(sorted(set(cls.SUPPORTED_FORMATS.values())))}"
	            )
	        return cls.SUPPORTED_FORMATS[canonical_mime], canonical_mime

	    def _validate_and_copy_file(self, file_path: Union[str, Path]) -> Path:
	        """
	        Validate file and copy to temporary location with correct extension.

	        The extension is trusted when it is listed in EXTENSION_TO_MIME;
	        otherwise the MIME type is detected from the file content and mapped
	        to an extension through SUPPORTED_FORMATS.

	        Args:
	            file_path: Path to the document file.

	        Returns:
	            Path of the copy inside ``self.temp_dir``.

	        Raises:
	            FileNotFoundError: If ``file_path`` is not an existing file.
	            UnsupportedFormatError: If the format cannot be mapped to a
	                supported one.
	        """
	        file_path = Path(file_path)
	        # is_file() also rejects directories, which copy2 could not handle.
	        if not file_path.is_file():
	            raise FileNotFoundError(f"File not found: {file_path}")

	        # Try to determine format from extension first
	        extension = file_path.suffix.lower()

	        # If extension not recognized, use magic
	        if extension not in self.EXTENSION_TO_MIME:
	            mime_type = magic.from_file(str(file_path), mime=True)
	            if mime_type not in self.SUPPORTED_FORMATS:
	                raise UnsupportedFormatError(
	                    f"Unsupported file format: {mime_type}. "
	                    # sorted() keeps the message deterministic across runs.
	                    f"Supported formats are: {', '.join(sorted(set(self.SUPPORTED_FORMATS.values())))}"
	                )
	            extension = f".{self.SUPPORTED_FORMATS[mime_type]}"

	        # Copy file to temp directory with correct extension
	        temp_file = self.temp_dir / f"doc{extension}"
	        shutil.copy2(file_path, temp_file)
	        return temp_file

	    def _convert_document(self, temp_file: Path, metadata: "DocumentMetadata") -> "ParsedDocument":
	        """
	        Run Docling on ``temp_file`` and assemble the ``ParsedDocument``.

	        ``metadata`` is updated in place with title/author/pages when the
	        converted document exposes a non-empty ``metadata`` mapping.

	        Raises:
	            ParseError: If plain-text export fails. Other exceptions from
	                the converter propagate to the caller.
	        """
	        # Parse document using Docling
	        result = self.converter.convert(str(temp_file))
	        doc = result.document

	        # Extract content using proper methods
	        try:
	            content = doc.export_to_text()
	        except Exception as e:
	            raise ParseError(f"Failed to extract text content: {str(e)}") from e

	        # Extract structured content; attributes the document does not
	        # provide default to empty containers.
	        structured_content = {
	            'sections': getattr(doc, 'sections', []),
	            'paragraphs': getattr(doc, 'paragraphs', []),
	            'entities': getattr(doc, 'entities', {}),
	            'metadata': getattr(doc, 'metadata', {}),
	        }

	        # Get raw text if available; fall back to the plain export when the
	        # layout-preserving call is not supported or fails.
	        try:
	            raw_text = doc.export_to_text(include_layout=True)
	        except Exception:
	            raw_text = content

	        # Update metadata with document-specific information
	        doc_metadata = getattr(doc, 'metadata', None)
	        if doc_metadata:
	            metadata.title = doc_metadata.get('title')
	            metadata.author = doc_metadata.get('author')
	            metadata.pages = doc_metadata.get('pages')
	            metadata.extra.update(doc_metadata)

	        return ParsedDocument(
	            content=content,
	            metadata=metadata,
	            raw_text=raw_text,
	            structured_content=structured_content,
	            confidence_score=getattr(doc, 'confidence', 1.0),
	        )

	    def parse(self, file_path: Union[str, Path]) -> "ParsedDocument":
	        """
	        Parse a document file and return structured content

	        Args:
	            file_path: Path to the document file

	        Returns:
	            ParsedDocument object containing parsed content and metadata

	        Raises:
	            UnsupportedFormatError: If the file format is not supported
	            ParseError: If parsing fails (this also wraps any other error,
	                including a missing input file)
	        """
	        temp_file: Optional[Path] = None
	        try:
	            # Validate and prepare file
	            temp_file = self._validate_and_copy_file(file_path)

	            # Get file metadata from the ORIGINAL file: the temporary copy
	            # has a fresh st_ctime, so its timestamps describe the copy.
	            source_path = Path(file_path)
	            stats = source_path.stat()
	            detected_mime = magic.from_file(str(temp_file), mime=True)
	            file_type, mime_type = self._resolve_file_type(
	                detected_mime, temp_file.suffix
	            )

	            metadata = DocumentMetadata(
	                filename=source_path.name,  # Use original filename
	                file_type=file_type,
	                size_bytes=stats.st_size,
	                created_at=datetime.fromtimestamp(stats.st_ctime),
	                modified_at=datetime.fromtimestamp(stats.st_mtime),
	                mime_type=mime_type,
	            )

	            try:
	                return self._convert_document(temp_file, metadata)
	            except Exception as e:
	                raise ParseError(f"Failed to parse document: {str(e)}") from e

	        except (UnsupportedFormatError, ParseError):
	            # Already one of the documented exception types; do not re-wrap.
	            raise
	        except Exception as e:
	            raise ParseError(str(e)) from e

	        finally:
	            # Cleanup temporary files (best effort: a failed delete must not
	            # mask the parse result or the real error).
	            if temp_file is not None:
	                try:
	                    temp_file.unlink()
	                except OSError:
	                    pass

	    def supports_format(self, mime_type: str) -> bool:
	        """Check if a given MIME type is supported"""
	        return mime_type in self.SUPPORTED_FORMATS