112 lines
2.7 KiB
Python
112 lines
2.7 KiB
Python
from pdfminer.pdfparser import PDFParser
|
|
from pdfminer.pdfdocument import PDFDocument
|
|
from pdfminer.pdfpage import PDFPage
|
|
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
|
|
from pdfminer.pdfinterp import PDFResourceManager
|
|
from pdfminer.pdfinterp import PDFPageInterpreter
|
|
from pdfminer.pdfdevice import PDFDevice
|
|
from pdfminer.converter import PDFPageAggregator
|
|
from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal
|
|
|
|
|
|
def parse_text_stream(layout, t=None):
    """Collect horizontal text lines from a pdf layout tree.

    Walks ``layout._objs`` depth-first. Each LTTextLineHorizontal
    instance encountered is appended to the result; every other
    object is searched recursively for nested text lines.

    Parameters
    ----------
    layout : object
        Layout node whose ``_objs`` children are searched.

    t : list
        Optional list to append to. A new list is created when omitted.

    Returns
    -------
    t : list
        The LTTextLineHorizontal objects found, in traversal order.
    """
    t = [] if t is None else t
    try:
        for child in layout._objs:
            if not isinstance(child, LTTextLineHorizontal):
                # Not a text line itself: look for text lines inside it.
                t += parse_text_stream(child)
                continue
            t.append(child)
    except AttributeError:
        # Typically a node without ``_objs``: nothing to descend into.
        pass
    return t
|
|
|
|
|
|
def parse_text_lattice(layout, t=None):
    """Collect individual characters from a pdf layout tree.

    Walks ``layout._objs`` depth-first. Each LTChar instance
    encountered is appended to the result; every other object is
    searched recursively for nested characters.

    Parameters
    ----------
    layout : object
        Layout node whose ``_objs`` children are searched.

    t : list
        Optional list to append to. A new list is created when omitted.

    Returns
    -------
    t : list
        The LTChar objects found, in traversal order.
    """
    t = [] if t is None else t
    try:
        for child in layout._objs:
            if not isinstance(child, LTChar):
                # Not a character itself: look for characters inside it.
                t += parse_text_lattice(child)
                continue
            t.append(child)
    except AttributeError:
        # Typically a node without ``_objs``: nothing to descend into.
        pass
    return t
|
|
|
|
|
|
def get_pdf_info(pdfname, method=None, char_margin=2.0, line_margin=0.5,
                 word_margin=0.1):
    """Get list of text objects along with pdf width and height.

    Every page of the document is run through pdfminer's layout
    analysis; the values returned are those of the last page processed.

    Parameters
    ----------
    pdfname : string
        Path of the PDF file to read.

    method : string
        'stream' collects text via parse_text_stream, 'lattice' via
        parse_text_lattice. A falsy value makes the function return
        None without touching the file.

    char_margin : float
        Forwarded to LAParams.

    line_margin : float
        Forwarded to LAParams.

    word_margin : float
        Forwarded to LAParams.

    Returns
    -------
    text : list
        Objects collected by the parser selected through `method`.

    pdf_x : int
        ``layout.bbox[2]`` of the page layout.

    pdf_y : int
        ``layout.bbox[3]`` of the page layout.

    Raises
    ------
    ValueError
        If `method` is neither 'stream' nor 'lattice', or if the
        document yields no pages.
    PDFTextExtractionNotAllowed
        If the document does not allow text extraction.
    """
    if not method:
        return None

    # Reject unknown methods before doing any I/O; previously this only
    # surfaced as an UnboundLocalError at the return statement.
    if method == 'stream':
        parse_text = parse_text_stream
    elif method == 'lattice':
        parse_text = parse_text_lattice
    else:
        raise ValueError(
            "method must be 'stream' or 'lattice', got {0!r}".format(method))

    result = None
    # PDFs are binary data: text mode breaks parsing on Python 3 and can
    # corrupt bytes through newline translation on Windows.
    with open(pdfname, 'rb') as f:
        parser = PDFParser(f)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        laparams = LAParams(char_margin=char_margin,
                            line_margin=line_margin,
                            word_margin=word_margin)
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            # Each page overwrites the previous one: the last page wins.
            result = (parse_text(layout), layout.bbox[2], layout.bbox[3])

    if result is None:
        raise ValueError("no pages found in {0!r}".format(pdfname))
    text, pdf_x, pdf_y = result
    return text, pdf_x, pdf_y
|