"""
OCR Text Detection App
======================
Gradio app for OCR text extraction
Features: File upload, URL upload, demo images, confidence filtering
"""
import cv2
import easyocr
import numpy as np
from pathlib import Path
import gradio as gr
import urllib.request
import tempfile
import os
import warnings
from PIL import Image
warnings.filterwarnings('ignore')
# Initialize EasyOCR reader once (reused for all images)
reader = easyocr.Reader(['en'], gpu=False, verbose=False)
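# Note: the first Reader() initialization downloads the EasyOCR detection and
# recognition models if they are not already cached; gpu=False forces CPU inference.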
def format_text_aligned(results):
    """Format OCR results by grouping text by Y-coordinate (lines) and sorting by X (left-to-right)."""
    if not results:
        return ""
    # Extract Y-center and X-min for each detection
    detections = [(sum(p[1] for p in bbox) / len(bbox), min(p[0] for p in bbox), text) for bbox, text, _ in results]
    if not detections:
        return ""
    # Calculate threshold to group detections on same line (30% of avg line spacing)
    y_coords = [d[0] for d in detections]
    y_threshold = (max(y_coords) - min(y_coords)) / len(set(int(y) for y in y_coords)) * 0.3
    # Sort by Y (top to bottom), then X (left to right)
    detections.sort(key=lambda d: (d[0], d[1]))
    lines, current_line, current_y = [], [], detections[0][0] if detections else 0
    # Group detections by similar Y coordinates into lines
    for y, x, text in detections:
        if abs(y - current_y) <= y_threshold:
            current_line.append((x, text))
        else:
            if current_line:
                lines.append(' '.join(t[1] for t in sorted(current_line, key=lambda item: item[0])))
            current_line, current_y = [(x, text)], y
    if current_line:
        lines.append(' '.join(t[1] for t in sorted(current_line, key=lambda item: item[0])))
    return '\n'.join(lines)
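
# Rough illustration of the grouping above (values are made up for the example):
# detections [(12.0, 5, "Hello"), (13.1, 80, "World"), (40.2, 5, "Bye")] would be
# merged into two lines, "Hello World" and "Bye", because 12.0 and 13.1 fall within
# the Y threshold of each other while 40.2 does not.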
def process_ocr(input_image, confidence_threshold=0.0):
    """Process image with OCR and return annotated image + formatted text."""
    if input_image is None:
        return None, ""
    # Convert PIL Image to numpy array, then to BGR for OpenCV
    if isinstance(input_image, Image.Image):
        input_image = np.array(input_image)
    image_bgr = cv2.cvtColor(input_image, cv2.COLOR_RGB2BGR)
    # Perform OCR
    results = reader.readtext(image_bgr)
    # Filter by confidence threshold
    filtered_results = [(bbox, text, conf) for bbox, text, conf in results if conf >= confidence_threshold]
    formatted_text = format_text_aligned(filtered_results)
    # Draw bounding boxes and labels on image
    annotated_image = image_bgr.copy()
    for bbox, text, confidence in filtered_results:
        # Draw bounding box polygon
        bbox_points = np.array([[int(p[0]), int(p[1])] for p in bbox], dtype=np.int32)
        cv2.polylines(annotated_image, [bbox_points], isClosed=True, color=(0, 255, 0), thickness=2)
        # Calculate position for the text label (label size scales with bounding box size)
        x_min, y_min = int(min(p[0] for p in bbox)), int(min(p[1] for p in bbox))
        x_max, y_max = int(max(p[0] for p in bbox)), int(max(p[1] for p in bbox))
        bbox_width = x_max - x_min
        bbox_height = y_max - y_min
        # Scale font size based on bounding box dimensions
        font_scale = min(bbox_width, bbox_height) / 100.0
        font_scale = max(0.3, min(font_scale, 1.5))  # Clamp between 0.3 and 1.5
        thickness = max(1, int(font_scale * 2))
        label = f"{text} ({confidence:.2f})"
        (w, h), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, font_scale, thickness)
        # Position text above or below box based on Y position
        text_y = y_min - 5 if y_min > 20 else y_min + 20
        # Draw background rectangle and text
        cv2.rectangle(annotated_image, (x_min - 2, text_y - h - 2), (x_min + w + 2, text_y + 2), (0, 255, 0), -1)
        cv2.putText(annotated_image, label, (x_min, text_y), cv2.FONT_HERSHEY_SIMPLEX, font_scale, (0, 0, 0), thickness)
    # Convert back to RGB and then to PIL Image for Gradio 5.x compatibility
    output_rgb = cv2.cvtColor(annotated_image, cv2.COLOR_BGR2RGB)
    output_pil = Image.fromarray(output_rgb)
    return output_pil, formatted_text or ""
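
# process_ocr can also be called outside the UI, e.g. (hypothetical path):
#   annotated, text = process_ocr(Image.open("images/sample.jpg"), confidence_threshold=0.5)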
# Load sample images for demo gallery
exts = ('.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif')
images_dir = Path('images')
# Fall back to an empty list if the images/ directory is missing, so the app still starts
sample_images = sorted([str(f) for f in images_dir.iterdir() if f.suffix.lower() in exts])[:3] if images_dir.is_dir() else []
# CSS for professional styling
css = """
.gradio-container {font-family: 'Segoe UI', sans-serif; max-width: 1400px; margin: 0 auto; overflow-x: hidden;}
body, html {overflow-x: hidden; scrollbar-width: none;}
::-webkit-scrollbar {display: none;}
h1 {text-align: center; color: #042AFF; margin-bottom: 1rem; font-size: 2.5rem; font-weight: bold; letter-spacing: -0.5px;}
.description {text-align: center; color: #6b7280; margin-bottom: 0.3rem; font-size: 1.05rem; line-height: 1.6;}
.credits {text-align: center; color: #f2faf4; margin-bottom: 2rem; margin-top: 0; font-size: 1rem;}
.credits a {color: #042AFF; text-decoration: none; font-weight: bold; transition: color 0.3s ease;}
.credits a:hover {color: #111F68; text-decoration: underline;}
"""
# Create Gradio interface
with gr.Blocks(title="OCR Text Detection", theme=gr.themes.Soft(), css=css) as demo:
    gr.Markdown("# 📄 OCR Text Detection")
    gr.Markdown("<div class='description'>Extract text from images with bounding boxes and confidence scores. Upload an image or select a demo image to get started.</div>", elem_classes=["description"])
    gr.Markdown("<div class='credits' style='text-align: center;'>Made by <a href='https://techtics.ai' target='_blank' style='color: #042AFF; text-decoration: none; font-weight: bold;'>Techtics.ai</a></div>", elem_classes=["credits"])
    # Main layout: Two columns
    with gr.Row():
        # Column 1: Upload area with tabs
        with gr.Column(scale=1):
            with gr.Tabs():
                with gr.Tab("Upload File"):
                    image_input = gr.Image(label="Upload Image", height=400, show_download_button=True, show_share_button=False, show_fullscreen_button=True)
                with gr.Tab("Image by URL"):
                    url_input = gr.Textbox(label="Image URL", placeholder="Enter image URL (jpg, png, etc.)", lines=1)
                    url_btn = gr.Button("Load Image from URL", variant="primary")
            # Demo images gallery
            if sample_images:
                gr.Markdown("### Demo Images (Click to load)")
                demo_gallery = gr.Gallery(value=sample_images, columns=3, rows=1, height=210, show_label=False, container=True, preview=False, object_fit="scale-down")
        # Column 2: Processed image and confidence slider
        with gr.Column(scale=1):
            gr.Markdown("### Processed Image")
            annotated_output = gr.Image(label="", height=400, visible=True, show_download_button=True, show_share_button=False, show_fullscreen_button=True)
            confidence_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.3, step=0.05, label="Confidence Threshold", info="Filter detections by minimum confidence score")
    # Text output below both columns (full width, hidden until processing)
    text_output = gr.Textbox(label="Extracted Text", value="", placeholder="Extracted text will appear here after processing...", lines=12, interactive=True, show_copy_button=True, visible=False)
    # Load image from URL
    def load_from_url(url):
        """Download and load image from URL."""
        if not url or not url.strip():
            return None
        try:
            req = urllib.request.Request(url.strip(), headers={'User-Agent': 'Mozilla/5.0'})
            with urllib.request.urlopen(req, timeout=10) as response:
                with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as tmp_file:
                    tmp_file.write(response.read())
                    tmp_path = tmp_file.name
            img = Image.open(tmp_path)
            img.load()  # Force PIL to read the image data before the temp file is removed
            os.unlink(tmp_path)
            return img
        except Exception:
            return None
    # Load demo image from gallery
    def load_from_gallery(evt: gr.SelectData):
        """Load demo image when clicked."""
        if evt.index < len(sample_images):
            return Image.open(sample_images[evt.index])
        return None
    # Event handlers
    url_btn.click(fn=load_from_url, inputs=url_input, outputs=image_input)
    url_input.submit(fn=load_from_url, inputs=url_input, outputs=image_input)
    if sample_images:
        demo_gallery.select(fn=load_from_gallery, outputs=image_input)

    # Process image when it changes or confidence slider changes
    def on_change(img, conf_thresh):
        """Process image and update annotated image + text output."""
        if img is None:
            return None, gr.update(visible=False, value="")
        annot, text = process_ocr(img, conf_thresh)
        return annot, gr.update(visible=True, value=text or "")

    # Process image when it changes - store event to allow cancellation
    process_event = image_input.change(
        fn=on_change,
        inputs=[image_input, confidence_slider],
        outputs=[annotated_output, text_output]
    )
    # Confidence slider cancels previous processing to avoid queue buildup
    confidence_slider.change(
        fn=on_change,
        inputs=[image_input, confidence_slider],
        outputs=[annotated_output, text_output],
        cancels=[process_event]  # Cancel previous image processing
    )
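
    # All three input paths (file upload, URL load, demo gallery click) write to
    # image_input, so the single change handler above covers every way an image arrives.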
if __name__ == "__main__":
    demo.launch()
# For local testing, use: demo.launch(share=True, server_name="0.0.0.0", server_port=7860)