Mi-Ni committed on
Commit
97a3fab
·
1 Parent(s): 6d55c1a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +66 -20
app.py CHANGED
@@ -37,26 +37,72 @@ def text_extraction(element):
37
  return (line_text, format_per_line)
38
 
39
  def read_pdf(pdf_path):
40
- text_per_page = {}
41
-
42
- # Open the PDF file
43
- pdf_doc = fitz.open(pdf_path)
44
-
45
- # Iterate through each page
46
- for page_number in range(pdf_doc.page_count):
47
- # Get the page
48
- page = pdf_doc.load_page(page_number)
49
-
50
- # Extract text from the page
51
- text = page.get_text("text")
52
-
53
- # Store the text in the dictionary
54
- text_per_page[f"Page_{page_number}"] = text
55
-
56
- # Close the PDF file
57
- pdf_doc.close()
58
-
59
- return text_per_page
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
  #pdf_path = 'Article 11 Hidden Technical Debt in Machine Learning Systems'
62
  pdf_path = gr.File()
 
37
  return (line_text, format_per_line)
38
 
39
  def read_pdf(pdf_path):
40
+ # create a PDF file object
41
+ pdfFileObj = open(pdf_path, 'rb')
42
+ # create a PDF reader object
43
+ pdfReaded = PyPDF2.PdfReader(pdfFileObj)
44
+ #pdfReaded = PdfReader(pdfFileObj)
45
+ # Create the dictionary to extract text from each image
46
+ text_per_page = {}
47
+ # We extract the pages from the PDF
48
+ for pagenum, page in enumerate(extract_pages(pdf_path)):
49
+ print("Elaborating Page_" +str(pagenum))
50
+ # Initialize the variables needed for the text extraction from the page
51
+ pageObj = pdfReaded.pages[pagenum]
52
+ page_text = []
53
+ line_format = []
54
+ text_from_images = []
55
+ text_from_tables = []
56
+ page_content = []
57
+ # Initialize the number of the examined tables
58
+ table_num = 0
59
+ first_element= True
60
+ table_extraction_flag= False
61
+ # Open the pdf file
62
+ pdf = pdfplumber.open(pdf_path)
63
+ # Find the examined page
64
+ page_tables = pdf.pages[pagenum]
65
+ # Find the number of tables on the page
66
+ tables = page_tables.find_tables()
67
+
68
+
69
+ # Find all the elements
70
+ page_elements = [(element.y1, element) for element in page._objs]
71
+ # Sort all the elements as they appear in the page
72
+ page_elements.sort(key=lambda a: a[0], reverse=True)
73
+
74
+ # Find the elements that composed a page
75
+ for i,component in enumerate(page_elements):
76
+ # Extract the position of the top side of the element in the PDF
77
+ pos= component[0]
78
+ # Extract the element of the page layout
79
+ element = component[1]
80
+
81
+ # Check if the element is a text element
82
+ if isinstance(element, LTTextContainer):
83
+ # Check if the text appeared in a table
84
+ if table_extraction_flag == False:
85
+ # Use the function to extract the text and format for each text element
86
+ (line_text, format_per_line) = text_extraction(element)
87
+ # Append the text of each line to the page text
88
+ page_text.append(line_text)
89
+ # Append the format for each line containing text
90
+ line_format.append(format_per_line)
91
+ page_content.append(line_text)
92
+ else:
93
+ # Omit the text that appeared in a table
94
+ pass
95
+
96
+
97
+ # Create the key of the dictionary
98
+ dctkey = 'Page_'+str(pagenum)
99
+ # Add the list of list as the value of the page key
100
+ text_per_page[dctkey]= [page_text, line_format, text_from_images,text_from_tables, page_content]
101
+
102
+ # Closing the pdf file object
103
+ pdfFileObj.close()
104
+
105
+ return text_per_page
106
 
107
  #pdf_path = 'Article 11 Hidden Technical Debt in Machine Learning Systems'
108
  pdf_path = gr.File()