Spaces:

Mi-Ni
/

PDFtoAudio

Runtime error

App Files Files Community

Mi-Ni committed on Dec 9, 2023

Commit

97a3fab

1 Parent(s): 6d55c1a

Update app.py

Browse files

Files changed (1) hide show

app.py +66 -20

app.py CHANGED Viewed

@@ -37,26 +37,72 @@ def text_extraction(element):
     return (line_text, format_per_line)
 def read_pdf(pdf_path):
-    text_per_page = {}
-    # Open the PDF file
-    pdf_doc = fitz.open(pdf_path)
-    # Iterate through each page
-    for page_number in range(pdf_doc.page_count):
-        # Get the page
-        page = pdf_doc.load_page(page_number)
-        # Extract text from the page
-        text = page.get_text("text")
-        # Store the text in the dictionary
-        text_per_page[f"Page_{page_number}"] = text
-    # Close the PDF file
-    pdf_doc.close()
-    return text_per_page
 #pdf_path = 'Article 11 Hidden Technical Debt in Machine Learning Systems'
 pdf_path = gr.File()

     return (line_text, format_per_line)
 def read_pdf(pdf_path):  # Returns {'Page_<n>': [page_text, line_format, text_from_images, text_from_tables, page_content]}
+  # Open the PDF in binary mode as the input stream for PyPDF2
+  pdfFileObj = open(pdf_path, 'rb')
+  # Create the PyPDF2 reader used for the per-page lookup below
+  pdfReaded = PyPDF2.PdfReader(pdfFileObj)
+  #pdfReaded = PdfReader(pdfFileObj)
+  # Dictionary mapping each page key ('Page_<n>') to the content collected for that page
+  text_per_page = {}
+  # Walk the pages yielded by extract_pages; pagenum is zero-based
+  for pagenum, page in enumerate(extract_pages(pdf_path)):
+      print("Elaborating Page_" +str(pagenum))
+      # Initialize the variables needed for the text extraction from the page
+      pageObj = pdfReaded.pages[pagenum]  # NOTE(review): not used again in this function
+      page_text = []
+      line_format = []
+      text_from_images = []  # never filled in this function; stored empty in the result
+      text_from_tables = []  # never filled in this function; stored empty in the result
+      page_content = []
+      # Initialize the number of the examined tables
+      table_num = 0  # NOTE(review): not used again in this function
+      first_element= True  # NOTE(review): not used again in this function
+      table_extraction_flag= False  # never set to True below, so no text is skipped
+      # Open the pdf file with pdfplumber
+      pdf = pdfplumber.open(pdf_path)  # NOTE(review): reopened for every page and never closed in this function
+      # Find the examined page
+      page_tables = pdf.pages[pagenum]
+      # Find the tables on the page
+      tables = page_tables.find_tables()  # NOTE(review): result is not used again in this function
+      # Pair every object in page._objs with its y1 coordinate
+      page_elements = [(element.y1, element) for element in page._objs]
+      # Sort by y1, largest first (presumably top-to-bottom in PDF coordinates -- TODO confirm)
+      page_elements.sort(key=lambda a: a[0], reverse=True)
+      # Iterate over the elements that compose the page
+      for i,component in enumerate(page_elements):
+          # Extract the position of the top side of the element in the PDF
+          pos= component[0]
+          # Extract the element of the page layout
+          element = component[1]
+          # Check if the element is a text element
+          if isinstance(element, LTTextContainer):
+              # Only keep the text while no table extraction is in progress
+              if table_extraction_flag == False:
+                  # Use the function to extract the text and format for each text element
+                  (line_text, format_per_line) = text_extraction(element)
+                  # Append the text of each line to the page text
+                  page_text.append(line_text)
+                  # Append the format for each line containing text
+                  line_format.append(format_per_line)
+                  page_content.append(line_text)  # the same text is also recorded in page_content
+              else:
+                  # Omit the text that appeared in a table
+                  pass
+      # Create the key of the dictionary
+      dctkey = 'Page_'+str(pagenum)
+      # Store the five per-page lists under the page key
+      text_per_page[dctkey]= [page_text, line_format, text_from_images,text_from_tables, page_content]
+  # Closing the pdf file object
+  pdfFileObj.close()  # NOTE(review): skipped if an exception is raised earlier
+  return text_per_page
 #pdf_path = 'Article 11 Hidden Technical Debt in Machine Learning Systems'
 pdf_path = gr.File()