Spaces:

hellorahulk
/

docling_free

Running

App Files Files Community

hellorahulk committed on Jan 23

Commit

ec3f76a

1 Parent(s): fdbfd73

Add URL input support for document processing

Browse files

Files changed (2) hide show

app.py +77 -13
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -6,17 +6,20 @@ from dockling_parser.exceptions import ParserError, UnsupportedFormatError
 import tempfile
 import mimetypes
 import traceback
 TITLE = "📄 Smart Document Parser"
 DESCRIPTION = """
 A powerful document parsing application that automatically extracts structured information from various document formats.
-Upload any document (PDF, DOCX, TXT, HTML, Markdown) and get structured information extracted automatically.
 """
 ARTICLE = """
 ## 🚀 Features
 - Multiple Format Support: PDF, DOCX, TXT, HTML, and Markdown
 - Rich Information Extraction
 - Smart Processing with Confidence Scoring
 - Automatic Format Detection
@@ -25,16 +28,30 @@ Made with ❤️ using Docling and Gradio
 """
 ERROR_MESSAGES = {
-    "no_file": (
-        "⚠️ No file uploaded",
-        "Please upload a document to process.",
         "No sections available",
         "No entities available",
         "Confidence Score: 0.0"
     ),
     "unsupported_format": (
         "⚠️ Unsupported file format",
-        "Please upload a file in one of the supported formats: PDF, DOCX, TXT, HTML, or MD.",
         "No sections available",
         "No entities available",
         "Confidence Score: 0.0"
@@ -51,14 +68,45 @@ ERROR_MESSAGES = {
 # Initialize the document parser
 parser = DocumentParser()
-def process_document(file_path):
-    """Process uploaded document and return structured information"""
-    if file_path is None:
-        return ERROR_MESSAGES["no_file"]
     try:
-        # Parse the document directly using the file path
-        result = parser.parse(file_path)
         # Prepare the outputs
         metadata_df = pd.DataFrame([{
@@ -110,6 +158,13 @@ def process_document(file_path):
             "No entities available",
             "Confidence Score: 0.0"
         )
 # Create Gradio interface
 with gr.Blocks(title=TITLE, theme=gr.themes.Soft()) as iface:
@@ -123,6 +178,10 @@ with gr.Blocks(title=TITLE, theme=gr.themes.Soft()) as iface:
                 file_types=[".pdf", ".docx", ".txt", ".html", ".md"],
                 type="filepath"
             )
             submit_btn = gr.Button("Process Document", variant="primary")
         with gr.Column():
@@ -158,8 +217,8 @@ with gr.Blocks(title=TITLE, theme=gr.themes.Soft()) as iface:
     # Handle file submission
     submit_btn.click(
-        fn=process_document,
-        inputs=[file_input],
         outputs=[
             content_output,
             metadata_output,
@@ -176,6 +235,11 @@ with gr.Blocks(title=TITLE, theme=gr.themes.Soft()) as iface:
     - Text Files (*.txt)
     - HTML Files (*.html)
     - Markdown Files (*.md)
     """)
     gr.Markdown(ARTICLE)

 import tempfile
 import mimetypes
 import traceback
+import requests
+from urllib.parse import urlparse
 TITLE = "📄 Smart Document Parser"
 DESCRIPTION = """
 A powerful document parsing application that automatically extracts structured information from various document formats.
+Upload a document or provide a URL (PDF, DOCX, TXT, HTML, Markdown) and get structured information automatically.
 """
 ARTICLE = """
 ## 🚀 Features
 - Multiple Format Support: PDF, DOCX, TXT, HTML, and Markdown
+- Support for File Upload and URLs
 - Rich Information Extraction
 - Smart Processing with Confidence Scoring
 - Automatic Format Detection
 """
 ERROR_MESSAGES = {
+    "no_input": (
+        "⚠️ No input provided",
+        "Please upload a document or provide a URL.",
+        "No sections available",
+        "No entities available",
+        "Confidence Score: 0.0"
+    ),
+    "invalid_url": (
+        "⚠️ Invalid URL",
+        "Please provide a valid URL to a document.",
+        "No sections available",
+        "No entities available",
+        "Confidence Score: 0.0"
+    ),
+    "download_error": (
+        "⚠️ Failed to download document",
+        "Could not download the document from the provided URL.",
         "No sections available",
         "No entities available",
         "Confidence Score: 0.0"
     ),
     "unsupported_format": (
         "⚠️ Unsupported file format",
+        "Please provide a file in one of the supported formats: PDF, DOCX, TXT, HTML, or MD.",
         "No sections available",
         "No entities available",
         "Confidence Score: 0.0"
 # Initialize the document parser
 parser = DocumentParser()
+def download_file(url: str) -> str:
+    """Download file from URL and save to temporary file"""
+    try:
+        # Extract filename from URL
+        parsed_url = urlparse(url)
+        filename = os.path.basename(parsed_url.path)
+        if not filename:
+            filename = "document.pdf"  # Default filename
+        # Download file
+        response = requests.get(url, allow_redirects=True)
+        response.raise_for_status()
+        # Save to temporary file
+        with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(filename)[1]) as tmp_file:
+            tmp_file.write(response.content)
+            return tmp_file.name
+    except Exception as e:
+        raise Exception(f"Failed to download file: {str(e)}")
+def process_input(file_input, url_input):
+    """Process either uploaded file or URL input"""
+    # Check if we have any input
+    if file_input is None and not url_input:
+        return ERROR_MESSAGES["no_input"]
+    temp_file = None
     try:
+        # Handle URL input if provided
+        if url_input:
+            try:
+                temp_file = download_file(url_input)
+                result = parser.parse(temp_file)
+            except Exception as e:
+                return ERROR_MESSAGES["download_error"]
+        # Handle file upload
+        else:
+            result = parser.parse(file_input)
         # Prepare the outputs
         metadata_df = pd.DataFrame([{
             "No entities available",
             "Confidence Score: 0.0"
         )
+    finally:
+        # Cleanup temporary file if it was created
+        if temp_file and os.path.exists(temp_file):
+            try:
+                os.unlink(temp_file)
+            except:
+                pass
 # Create Gradio interface
 with gr.Blocks(title=TITLE, theme=gr.themes.Soft()) as iface:
                 file_types=[".pdf", ".docx", ".txt", ".html", ".md"],
                 type="filepath"
             )
+            url_input = gr.Textbox(
+                label="Or Enter Document URL",
+                placeholder="https://example.com/document.pdf"
+            )
             submit_btn = gr.Button("Process Document", variant="primary")
         with gr.Column():
     # Handle file submission
     submit_btn.click(
+        fn=process_input,
+        inputs=[file_input, url_input],
         outputs=[
             content_output,
             metadata_output,
     - Text Files (*.txt)
     - HTML Files (*.html)
     - Markdown Files (*.md)
+    ### 🔗 Example URLs
+    - ArXiv PDFs: https://arxiv.org/pdf/2408.08921.pdf
+    - Research Papers
+    - Documentation
     """)
     gr.Markdown(ARTICLE)

requirements.txt CHANGED Viewed

@@ -9,4 +9,5 @@ gradio>=4.44.1
 pandas>=1.5.0
 huggingface-hub>=0.19.0
 python-magic-bin>=0.4.14; platform_system == "Windows"
-libmagic; platform_system == "Linux"

 pandas>=1.5.0
 huggingface-hub>=0.19.0
 python-magic-bin>=0.4.14; platform_system == "Windows"
+libmagic; platform_system == "Linux"
+requests>=2.31.0