Spaces:
Running
Running
Commit
·
cca0a5d
1
Parent(s):
070e4b3
Fix file handling with filepath type and better error handling
Browse files
app.py
CHANGED
|
@@ -26,41 +26,35 @@ Made with ❤️ using Docling and Gradio
|
|
| 26 |
# Initialize the document parser
|
| 27 |
parser = DocumentParser()
|
| 28 |
|
| 29 |
-
def get_file_extension(file_type):
|
| 30 |
-
"""Get file extension based on MIME type"""
|
| 31 |
-
extensions = {
|
| 32 |
-
'application/pdf': '.pdf',
|
| 33 |
-
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
|
| 34 |
-
'text/plain': '.txt',
|
| 35 |
-
'text/html': '.html',
|
| 36 |
-
'text/markdown': '.md'
|
| 37 |
-
}
|
| 38 |
-
return extensions.get(file_type, '.tmp')
|
| 39 |
-
|
| 40 |
def process_document(file_obj):
|
| 41 |
"""Process uploaded document and return structured information"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
temp_path = None
|
| 43 |
try:
|
| 44 |
-
#
|
| 45 |
-
if
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
else:
|
| 52 |
-
# Handle binary data directly
|
| 53 |
-
file_data = file_obj
|
| 54 |
-
extension = '.pdf' # Default to PDF for binary uploads
|
| 55 |
-
|
| 56 |
-
# Create temporary file
|
| 57 |
with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp_file:
|
| 58 |
-
|
| 59 |
-
|
|
|
|
|
|
|
| 60 |
else:
|
| 61 |
-
tmp_file.write(
|
| 62 |
temp_path = tmp_file.name
|
| 63 |
-
|
| 64 |
# Parse the document
|
| 65 |
result = parser.parse(temp_path)
|
| 66 |
|
|
@@ -121,7 +115,7 @@ with gr.Blocks(title=TITLE, theme=gr.themes.Soft()) as iface:
|
|
| 121 |
file_input = gr.File(
|
| 122 |
label="Upload Document",
|
| 123 |
file_types=[".pdf", ".docx", ".txt", ".html", ".md"],
|
| 124 |
-
type="binary"
|
| 125 |
)
|
| 126 |
submit_btn = gr.Button("Process Document", variant="primary")
|
| 127 |
|
|
|
|
| 26 |
# Initialize the document parser
|
| 27 |
parser = DocumentParser()
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
def process_document(file_obj):
|
| 30 |
"""Process uploaded document and return structured information"""
|
| 31 |
+
if file_obj is None:
|
| 32 |
+
return (
|
| 33 |
+
"Error: No file uploaded",
|
| 34 |
+
pd.DataFrame(),
|
| 35 |
+
"No sections available",
|
| 36 |
+
"No entities available",
|
| 37 |
+
"Confidence Score: 0.0"
|
| 38 |
+
)
|
| 39 |
+
|
| 40 |
temp_path = None
|
| 41 |
try:
|
| 42 |
+
# Create temporary file with appropriate extension
|
| 43 |
+
original_filename = file_obj.name if hasattr(file_obj, 'name') else "uploaded_file.pdf"
|
| 44 |
+
extension = os.path.splitext(original_filename)[1].lower()
|
| 45 |
+
if not extension:
|
| 46 |
+
extension = '.pdf' # Default to PDF if no extension
|
| 47 |
+
|
| 48 |
+
# Create temporary file and write content
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp_file:
|
| 50 |
+
# Write the content
|
| 51 |
+
content = file_obj.read() if hasattr(file_obj, 'read') else file_obj
|
| 52 |
+
if isinstance(content, bytes):
|
| 53 |
+
tmp_file.write(content)
|
| 54 |
else:
|
| 55 |
+
tmp_file.write(content.encode('utf-8'))
|
| 56 |
temp_path = tmp_file.name
|
| 57 |
+
|
| 58 |
# Parse the document
|
| 59 |
result = parser.parse(temp_path)
|
| 60 |
|
|
|
|
| 115 |
file_input = gr.File(
|
| 116 |
label="Upload Document",
|
| 117 |
file_types=[".pdf", ".docx", ".txt", ".html", ".md"],
|
| 118 |
+
type="filepath" # Changed from binary to filepath
|
| 119 |
)
|
| 120 |
submit_btn = gr.Button("Process Document", variant="primary")
|
| 121 |
|