Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| from doctr.models import ocr_predictor | |
| import numpy as np | |
# Build the OCR predictor once at import time; extract_word_ready_table reuses
# this module-level instance on every call instead of constructing a new one.
model = ocr_predictor(
    det_arch='db_resnet50',
    reco_arch='crnn_vgg16_bn',
    pretrained=True,
)
# Vertical slack added to the bottom edge of the previous word when deciding
# whether the next word still belongs to the same table row.
_ROW_GAP_TOLERANCE = 0.01


def _collect_words(page):
    """Flatten one exported page into a list of word dicts (text + box edges)."""
    words = []
    for block in page['blocks']:
        for line in block['lines']:
            for word in line['words']:
                top_left = word['geometry'][0]
                bottom_right = word['geometry'][1]
                y_top = top_left[1]
                y_bot = bottom_right[1]
                words.append({
                    'text': word['value'],
                    'y_top': y_top,
                    'y_bot': y_bot,
                    'y_mid': (y_top + y_bot) / 2,
                    'x_mid': (top_left[0] + bottom_right[0]) / 2,
                })
    return words


def _group_rows(words):
    """Split words (already sorted by 'y_mid') into table rows.

    A word joins the current row when its top edge lies above the bottom edge
    of the most recently added word plus ``_ROW_GAP_TOLERANCE``; otherwise it
    opens a new row. Returns a list of rows, each a non-empty list of words.
    """
    if not words:
        return []
    rows = []
    current_row = [words[0]]
    for word in words[1:]:
        if word['y_top'] < current_row[-1]['y_bot'] + _ROW_GAP_TOLERANCE:
            current_row.append(word)
        else:
            rows.append(current_row)
            current_row = [word]
    rows.append(current_row)
    return rows


def _reading_order(words):
    """Reorder words (pre-sorted by 'y_mid') into text lines read left to right.

    Sorting by vertical centre alone leaves words that sit on the same text
    line in an arbitrary horizontal order, because their centres differ only
    by tiny amounts. Here the first word of each text line acts as an anchor:
    following words whose centre is not below the anchor's bottom edge belong
    to that line, and each line is then sorted by horizontal centre.
    """
    ordered = []
    text_line = []
    for word in words:
        if text_line and word['y_mid'] > text_line[0]['y_bot']:
            ordered.extend(sorted(text_line, key=lambda w: w['x_mid']))
            text_line = []
        text_line.append(word)
    ordered.extend(sorted(text_line, key=lambda w: w['x_mid']))
    return ordered


def _row_to_markdown(row, col_bounds):
    """Render one row of words as a pipe-delimited line, or None if it is blank."""
    last_slot = len(col_bounds)
    cells = [[] for _ in range(last_slot + 1)]
    for word in row:
        # First boundary strictly greater than the word centre picks the
        # column; anything at or beyond the last boundary goes in the final one.
        slot = next(
            (i for i, bound in enumerate(col_bounds) if word['x_mid'] < bound),
            last_slot,
        )
        cells[slot].append(word)
    texts = [
        " ".join(w['text'] for w in _reading_order(cell)).strip()
        for cell in cells
    ]
    if not any(texts):
        return None
    # The pipe (|) is the only separator, so Word can split on it directly.
    return "| " + " | ".join(texts) + " |"


def extract_word_ready_table(image, col_bounds=(0.28, 0.35, 0.48, 0.62, 0.88)):
    """OCR an image and return its words as pipe-delimited table rows.

    Words are grouped into rows by vertical overlap and assigned to columns by
    comparing the horizontal centre of their box against ``col_bounds``. Text
    that wraps over several lines inside one cell is collapsed onto a single
    output line, keeping reading order (top to bottom, left to right).

    Args:
        image: The uploaded image (a PIL image in this app), or None when
            nothing was uploaded.
        col_bounds: Ascending right-hand boundaries of every column except the
            last, in the same units as the OCR word geometry. The defaults
            correspond to the layout
            Name | Code | Statement | Group | Sub-Group | Normally.
            NOTE(review): docTR reports geometry relative to the page size, so
            these are presumably fractions of the page width - confirm.

    Returns:
        One line per table row, joined with newlines, each of the form
        ``| cell | cell | ... |``; an empty string if no words were found; or
        the message "Please upload an image." when ``image`` is None.
    """
    if image is None:
        return "Please upload an image."

    # Uploads can be RGBA, palette or grayscale; normalise so the array handed
    # to the predictor always has three channels.
    if hasattr(image, "convert"):
        image = image.convert("RGB")
    img_array = np.array(image)

    result = model([img_array])
    json_export = result.export()

    markdown_rows = []
    for page in json_export['pages']:
        words = _collect_words(page)
        if not words:
            continue
        words.sort(key=lambda w: w['y_mid'])
        for row in _group_rows(words):
            line = _row_to_markdown(row, col_bounds)
            if line is not None:
                markdown_rows.append(line)
    return "\n".join(markdown_rows)
# UI layout: image upload + extract button on the left, result box + copy
# button on the right.
with gr.Blocks() as demo:
    gr.Markdown("## π Word-Ready Accountancy Extractor")
    gr.Markdown("Forces wrapped text into a single line to prevent Word from merging cells incorrectly.")
    with gr.Row():
        with gr.Column():
            img_in = gr.Image(type="pil")
            btn = gr.Button("Extract for Word", variant="primary")
        with gr.Column():
            out = gr.Textbox(label="Result (One Line Per Row)", lines=25, elem_id="out_box")
            copy_btn = gr.Button("π Copy Table")

    # Client-side only (no Python callback): copy the result box to the
    # clipboard. The selector must match the Textbox's elem_id ("out_box");
    # a mismatched id makes querySelector return null and the handler throw.
    copy_btn.click(None, None, None, js="""
    () => {
        const text = document.querySelector('#out_box textarea').value;
        navigator.clipboard.writeText(text);
        alert('Copied! Now use Insert > Table > Convert Text to Table in Word.');
    }
    """)
    btn.click(extract_word_ready_table, inputs=img_in, outputs=out)

demo.launch()