KhacHuy committed on
Commit
0d4ebe0
·
verified ·
1 Parent(s): 530c6f1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +157 -103
app.py CHANGED
@@ -1,149 +1,203 @@
1
  import gradio as gr
2
  import torch
3
- from transformers import AutoTokenizer, AutoModelForCausalLM
4
  import os
5
  import tempfile
6
  from PIL import Image, ImageDraw
7
  import re
8
 
9
- # -----------------------------------------
10
- # Force CPU Mode
11
- # -----------------------------------------
12
- device = torch.device("cpu")
13
- torch.set_default_device(device)
14
-
15
- # -----------------------------------------
16
- # 1. Load model ONCE at startup (CPU)
17
- # -----------------------------------------
18
- print("🔄 Loading model and tokenizer...")
19
  model_name = "deepseek-ai/DeepSeek-OCR"
20
-
21
  tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
22
 
23
- model = AutoModelForCausalLM.from_pretrained(
 
24
  model_name,
25
  trust_remote_code=True,
26
- torch_dtype=torch.float32,
27
- device_map={"": "cpu"}
28
  )
 
 
29
 
30
- model.eval()
31
- print("✅ Model loaded successfully (CPU mode)!")
32
-
33
- # -----------------------------------------
34
- # Helper: find generated result images
35
- # -----------------------------------------
36
  def find_result_image(path):
37
  for filename in os.listdir(path):
38
  if "grounding" in filename or "result" in filename:
39
  try:
40
- return Image.open(os.path.join(path, filename))
41
- except:
42
- continue
 
43
  return None
44
 
45
- # -----------------------------------------
46
- # 2. OCR main function
47
- # -----------------------------------------
48
  def process_ocr_task(image, model_size, task_type, ref_text):
49
-
 
 
 
50
  if image is None:
51
- return "Please upload image first.", None
52
-
53
- print("⚙️ Running OCR (CPU mode)...")
54
-
55
- # Prompt logic
56
- if task_type == "📝 Free OCR":
57
- prompt = "<image>\nFree OCR."
58
- elif task_type == "📄 Convert to Markdown":
59
- prompt = "<image>\n<|grounding|>Convert document to markdown."
60
- elif task_type == "📈 Parse Figure":
61
- prompt = "<image>\nParse the figure."
62
- elif task_type == "🔍 Locate Object by Reference":
63
- if not ref_text.strip():
64
- raise gr.Error("Reference text required!")
65
- prompt = f"<image>\nLocate <|ref|>{ref_text.strip()}<|/ref|> in the image."
66
- else:
67
- prompt = "<image>\nFree OCR."
68
-
69
- # Size configs
70
- size_configs = {
71
- "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
72
- "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
73
- "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
74
- "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
75
- "Gundam (Recommended)": {"base_size": 1024, "image_size": 640, "crop_mode": True},
76
- }
77
- config = size_configs[model_size]
78
-
79
- # Temporary image save
80
- with tempfile.TemporaryDirectory() as output_path:
81
- img_path = os.path.join(output_path, "input.png")
82
- image.save(img_path)
83
 
84
- # Corrected infer API for DeepSeek-OCR
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  text_result = model.infer(
86
- img=image,
87
  prompt=prompt,
88
- output_dir=output_path,
 
89
  base_size=config["base_size"],
90
  image_size=config["image_size"],
91
- crop_mode=config["crop_mode"]
 
 
 
92
  )
93
 
94
- print("📜 Output text:", text_result[:200])
95
 
96
- # Detect bounding boxes if exist
 
 
 
97
  pattern = re.compile(r"<\|det\|>\[\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]\]<\|/det\|>")
98
  matches = list(pattern.finditer(text_result))
99
 
100
  if matches:
101
- result_img = image.copy()
102
- draw = ImageDraw.Draw(result_img)
 
 
 
103
  w, h = image.size
104
 
105
- for m in matches:
106
- x1n, y1n, x2n, y2n = map(int, m.groups())
107
- draw.rectangle([
108
- int(x1n / 1000 * w),
109
- int(y1n / 1000 * h),
110
- int(x2n / 1000 * w),
111
- int(y2n / 1000 * h),
112
- ], outline="red", width=3)
113
-
114
- return text_result, result_img
115
-
116
- return text_result, find_result_image(output_path)
117
-
118
- # -----------------------------------------
119
- # 3. UI Layout
120
- # -----------------------------------------
121
- with gr.Blocks(title="🐳DeepSeek-OCR🐳", theme=gr.themes.Soft()) as demo:
122
- gr.Markdown("## DeepSeek-OCR Demo - CPU Mode")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
 
124
  with gr.Row():
125
  with gr.Column(scale=1):
126
- image_input = gr.Image(type="pil", label="Upload Image")
127
  model_size = gr.Dropdown(
128
- ["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
129
- value="Gundam (Recommended)"
 
130
  )
131
  task_type = gr.Dropdown(
132
- ["📝 Free OCR", "📄 Convert to Markdown", "📈 Parse Figure", "🔍 Locate Object by Reference"],
133
- value="📄 Convert to Markdown"
 
 
 
 
 
 
134
  )
135
- ref_text = gr.Textbox(visible=False)
136
- btn = gr.Button("🚀 Process")
137
 
138
  with gr.Column(scale=2):
139
- out_text = gr.Textbox(lines=12, show_copy_button=True)
140
- out_image = gr.Image(type="pil", label="Result")
141
-
142
- def toggle(t):
143
- return gr.Textbox(visible=(t == "🔍 Locate Object by Reference"))
144
-
145
- task_type.change(toggle, task_type, ref_text)
146
- btn.click(process_ocr_task, [image_input, model_size, task_type, ref_text], [out_text, out_image])
147
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  if __name__ == "__main__":
149
- demo.queue().launch(server_name="0.0.0.0", server_port=7860)
 
 
 
 
1
  import gradio as gr
2
  import torch
3
+ from transformers import AutoModel, AutoTokenizer
4
  import os
5
  import tempfile
6
  from PIL import Image, ImageDraw
7
  import re
8
 
9
+ # --- 1. Load Model and Tokenizer (CPU only) ---
10
+ print("Loading model and tokenizer on CPU...")
 
 
 
 
 
 
 
 
11
  model_name = "deepseek-ai/DeepSeek-OCR"
 
12
  tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
13
 
14
+ # Load model directly to CPU without flash_attention_2 (GPU-only feature)
15
+ model = AutoModel.from_pretrained(
16
  model_name,
17
  trust_remote_code=True,
18
+ use_safetensors=True,
19
+ torch_dtype=torch.float32 # Use float32 for CPU
20
  )
21
+ model = model.eval()
22
+ print("✅ Model loaded successfully on CPU.")
23
 
24
+ # --- Helper function to find pre-generated result images ---
 
 
 
 
 
25
  def find_result_image(path):
26
  for filename in os.listdir(path):
27
  if "grounding" in filename or "result" in filename:
28
  try:
29
+ image_path = os.path.join(path, filename)
30
+ return Image.open(image_path)
31
+ except Exception as e:
32
+ print(f"Error opening result image {filename}: {e}")
33
  return None
34
 
35
+ # --- 2. Main Processing Function (CPU version) ---
 
 
36
  def process_ocr_task(image, model_size, task_type, ref_text):
37
+ """
38
+ Processes an image with DeepSeek-OCR for all supported tasks.
39
+ CPU-only version without GPU decorators.
40
+ """
41
  if image is None:
42
+ return "Please upload an image first.", None
43
+
44
+ print("🚀 Processing on CPU...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
+ with tempfile.TemporaryDirectory() as output_path:
47
+ # Build the prompt
48
+ if task_type == "📝 Free OCR":
49
+ prompt = "<image>\nFree OCR."
50
+ elif task_type == "📄 Convert to Markdown":
51
+ prompt = "<image>\n<|grounding|>Convert the document to markdown."
52
+ elif task_type == "📈 Parse Figure":
53
+ prompt = "<image>\nParse the figure."
54
+ elif task_type == "🔍 Locate Object by Reference":
55
+ if not ref_text or ref_text.strip() == "":
56
+ raise gr.Error("For the 'Locate' task, you must provide the reference text to find!")
57
+ prompt = f"<image>\nLocate <|ref|>{ref_text.strip()}<|/ref|> in the image."
58
+ else:
59
+ prompt = "<image>\nFree OCR."
60
+
61
+ temp_image_path = os.path.join(output_path, "temp_image.png")
62
+ image.save(temp_image_path)
63
+
64
+ # Configure model size
65
+ size_configs = {
66
+ "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
67
+ "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
68
+ "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
69
+ "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
70
+ "Gundam (Recommended)": {"base_size": 1024, "image_size": 640, "crop_mode": True},
71
+ }
72
+ config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])
73
+
74
+ print(f"🏃 Running inference with prompt: {prompt}")
75
+
76
+ # Run inference on CPU (model is already on CPU)
77
  text_result = model.infer(
78
+ tokenizer,
79
  prompt=prompt,
80
+ image_file=temp_image_path,
81
+ output_path=output_path,
82
  base_size=config["base_size"],
83
  image_size=config["image_size"],
84
+ crop_mode=config["crop_mode"],
85
+ save_results=True,
86
+ test_compress=True,
87
+ eval_mode=True,
88
  )
89
 
90
+ print(f"====\n📄 Text Result: {text_result}\n====")
91
 
92
+ # Try to find and draw all bounding boxes
93
+ result_image_pil = None
94
+
95
+ # Pattern to find coordinates like [[280, 15, 696, 997]]
96
  pattern = re.compile(r"<\|det\|>\[\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]\]<\|/det\|>")
97
  matches = list(pattern.finditer(text_result))
98
 
99
  if matches:
100
+ print(f"✅ Found {len(matches)} bounding box(es). Drawing on the original image.")
101
+
102
+ # Create a copy of the original image to draw on
103
+ image_with_bboxes = image.copy()
104
+ draw = ImageDraw.Draw(image_with_bboxes)
105
  w, h = image.size
106
 
107
+ for match in matches:
108
+ # Extract coordinates as integers
109
+ coords_norm = [int(c) for c in match.groups()]
110
+ x1_norm, y1_norm, x2_norm, y2_norm = coords_norm
111
+
112
+ # Scale normalized coordinates to actual image size
113
+ x1 = int(x1_norm / 1000 * w)
114
+ y1 = int(y1_norm / 1000 * h)
115
+ x2 = int(x2_norm / 1000 * w)
116
+ y2 = int(y2_norm / 1000 * h)
117
+
118
+ # Draw rectangle with red outline
119
+ draw.rectangle([x1, y1, x2, y2], outline="red", width=3)
120
+
121
+ result_image_pil = image_with_bboxes
122
+ else:
123
+ print("⚠️ No bounding box coordinates found. Falling back to search for result image file.")
124
+ result_image_pil = find_result_image(output_path)
125
+
126
+ return text_result, result_image_pil
127
+
128
+
129
+ # --- 3. Build the Gradio Interface ---
130
+ with gr.Blocks(title="🐳DeepSeek-OCR (CPU)🐳", theme=gr.themes.Soft()) as demo:
131
+ gr.Markdown(
132
+ """
133
+ # 🐳 DeepSeek-OCR (CPU Version) 🐳
134
+ **⚠️ Note: Running on CPU - processing will be slower than GPU version**
135
+
136
+ **💡 How to use:**
137
+ 1. **Upload an image** using the upload box.
138
+ 2. Select a **Resolution**. Start with `Tiny` or `Small` for faster CPU processing.
139
+ 3. Choose a **Task Type**:
140
+ - **📝 Free OCR**: Extracts raw text from the image.
141
+ - **📄 Convert to Markdown**: Converts the document into Markdown.
142
+ - **📈 Parse Figure**: Extracts structured data from charts.
143
+ - **🔍 Locate Object by Reference**: Finds a specific object/text.
144
+ 4. If this helpful, please give it a like! 🙏 ❤️
145
+ """
146
+ )
147
 
148
  with gr.Row():
149
  with gr.Column(scale=1):
150
+ image_input = gr.Image(type="pil", label="🖼️ Upload Image", sources=["upload", "clipboard"])
151
  model_size = gr.Dropdown(
152
+ choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
153
+ value="Small", # Default to Small for faster CPU processing
154
+ label="⚙️ Resolution Size"
155
  )
156
  task_type = gr.Dropdown(
157
+ choices=["📝 Free OCR", "📄 Convert to Markdown", "📈 Parse Figure", "🔍 Locate Object by Reference"],
158
+ value="📄 Convert to Markdown",
159
+ label="🚀 Task Type"
160
+ )
161
+ ref_text_input = gr.Textbox(
162
+ label="📝 Reference Text (for Locate task)",
163
+ placeholder="e.g., the teacher, 20-10, a red car...",
164
+ visible=False
165
  )
166
+ submit_btn = gr.Button("Process Image", variant="primary")
 
167
 
168
  with gr.Column(scale=2):
169
+ output_text = gr.Textbox(label="📄 Text Result", lines=15, show_copy_button=True)
170
+ output_image = gr.Image(label="🖼️ Image Result (if any)", type="pil")
171
+
172
+ # UI Interaction Logic
173
+ def toggle_ref_text_visibility(task):
174
+ return gr.Textbox(visible=True) if task == "🔍 Locate Object by Reference" else gr.Textbox(visible=False)
175
+
176
+ task_type.change(fn=toggle_ref_text_visibility, inputs=task_type, outputs=ref_text_input)
177
+ submit_btn.click(
178
+ fn=process_ocr_task,
179
+ inputs=[image_input, model_size, task_type, ref_text_input],
180
+ outputs=[output_text, output_image]
181
+ )
182
+
183
+ # Examples
184
+ gr.Examples(
185
+ examples=[
186
+ ["doc_markdown.png", "Small", "📄 Convert to Markdown", ""],
187
+ ["chart.png", "Small", "📈 Parse Figure", ""],
188
+ ["teacher.jpg", "Tiny", "🔍 Locate Object by Reference", "the teacher"],
189
+ ["math_locate.jpg", "Tiny", "🔍 Locate Object by Reference", "20-10"],
190
+ ["receipt.jpg", "Small", "📝 Free OCR", ""],
191
+ ],
192
+ inputs=[image_input, model_size, task_type, ref_text_input],
193
+ outputs=[output_text, output_image],
194
+ fn=process_ocr_task,
195
+ cache_examples=False,
196
+ )
197
+
198
+ # --- 4. Launch the App ---
199
  if __name__ == "__main__":
200
+ if not os.path.exists("examples"):
201
+ os.makedirs("examples")
202
+
203
+ demo.queue(max_size=5).launch(share=True) # Reduced queue size for CPU