59 lines
1.8 KiB
Python
59 lines
1.8 KiB
Python
from pdfminer.pdfparser import PDFParser
|
|
from pdfminer.pdfdocument import PDFDocument
|
|
from pdfminer.pdfpage import PDFPage
|
|
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
|
|
from pdfminer.pdfinterp import PDFResourceManager
|
|
from pdfminer.pdfinterp import PDFPageInterpreter
|
|
from pdfminer.pdfdevice import PDFDevice
|
|
from pdfminer.converter import PDFPageAggregator
|
|
from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal
|
|
|
|
|
|
def parse_text_basic(layout, t=None):
    """Collect the horizontal text lines nested anywhere below *layout*.

    The children stored in ``layout._objs`` are visited in order.  A child
    whose exact type is ``LTTextLineHorizontal`` is appended as-is (it is not
    descended into); any other child is searched recursively and whatever
    that search yields is merged into the result.

    Args:
        layout: Layout object to search.  Objects without an ``_objs``
            attribute contribute nothing.
        t: Optional list to extend in place.  A new list is created when
            omitted.

    Returns:
        The list of collected ``LTTextLineHorizontal`` objects (``t`` itself
        when one was supplied).
    """
    found = [] if t is None else t
    try:
        children = layout._objs
        for child in children:
            # Exact type comparison on purpose: subclasses are not treated
            # as matches and are searched like any other container.
            if type(child) is not LTTextLineHorizontal:
                found += parse_text_basic(child)
                continue
            found.append(child)
    except AttributeError:
        # Typically raised by objects that have no ``_objs``; the walk of
        # this node stops and whatever was gathered so far is returned.
        pass
    return found
|
|
|
|
|
|
def parse_text_spreadsheet(layout, t=None):
    """Collect the individual characters nested anywhere below *layout*.

    Each entry of ``layout._objs`` is examined in order: an entry whose exact
    type is ``LTChar`` is kept, every other entry is searched recursively and
    its characters are merged into the result.

    Args:
        layout: Layout object to search.  Objects without an ``_objs``
            attribute contribute nothing.
        t: Optional list to extend in place.  A new list is created when
            omitted.

    Returns:
        The list of collected ``LTChar`` objects (``t`` itself when one was
        supplied).
    """
    chars = t if t is not None else []
    try:
        for item in layout._objs:
            # Exact type comparison on purpose: only plain ``LTChar``
            # instances are kept, everything else is searched for characters.
            chars += [item] if type(item) is LTChar else parse_text_spreadsheet(item)
    except AttributeError:
        # Typically raised by objects that have no ``_objs``; the walk of
        # this node stops and whatever was gathered so far is returned.
        pass
    return chars
|
|
|
|
|
|
def get_pdf_info(pdfname, method):
    """Extract layout objects and the page extent from a PDF file.

    Args:
        pdfname: Path of the PDF file to read.
        method: ``'basic'`` to collect objects with ``parse_text_basic``, or
            ``'spreadsheet'`` to collect them with ``parse_text_spreadsheet``.

    Returns:
        A ``(text, pdf_x, pdf_y)`` tuple.  ``text`` is the list produced by
        the selected parser; ``pdf_x`` and ``pdf_y`` are elements 2 and 3 of
        the page layout's ``bbox`` (presumably the right and top edges of the
        page -- confirm against the caller).

    Raises:
        ValueError: If ``method`` is not a supported name, or the document
            yields no pages.
        PDFTextExtractionNotAllowed: If the document forbids extraction.
    """
    # Fail fast on an unsupported method instead of hitting an unbound
    # ``text`` variable after the whole document has been processed.
    if method not in ('basic', 'spreadsheet'):
        raise ValueError(
            "method must be 'basic' or 'spreadsheet', got %r" % (method,))

    # PDFs are binary data: text mode would decode the bytes (Python 3) or
    # translate newlines (Windows) and corrupt what the parser reads.
    with open(pdfname, 'rb') as f:
        parser = PDFParser(f)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # NOTE(review): only the layout of the last page processed is kept;
        # earlier pages are interpreted but their results are discarded.
        # Confirm this is the intended behaviour for multi-page PDFs.
        layout = None
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()

        if layout is None:
            raise ValueError("no pages found in %r" % (pdfname,))

        # Parse once, after the loop: only the surviving layout matters.
        if method == 'basic':
            text = parse_text_basic(layout)
        else:
            text = parse_text_spreadsheet(layout)
        pdf_x, pdf_y = layout.bbox[2], layout.bbox[3]

    return text, pdf_x, pdf_y
|