# Source: camelot-py/debug/camelot_scripts/print_text.py (83 lines, 2.5 KiB, Python)

"""
usage: python print_text.py file.pdf
prints horizontal and vertical text lines present in a pdf file.
"""
import sys
import time
from pprint import pprint
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.layout import (LAParams, LTChar, LTAnno, LTTextBoxHorizontal,
LTTextLineHorizontal, LTTextLineVertical, LTLine)
def timeit(func):
    """Decorate ``func`` so that every call reports its wall-clock duration.

    The wrapper forwards all positional and keyword arguments to ``func``,
    prints ``Function: '<name>' took: <seconds> seconds`` to stdout once the
    call has returned, and hands back ``func``'s return value unchanged.
    Nothing is printed when ``func`` raises; the exception simply propagates.

    :param func: callable to time.
    :returns: a wrapper accepting the same arguments as ``func``.
    """
    # Imported locally so the decorator does not rely on a module-level
    # ``import functools``.
    import functools

    @functools.wraps(func)  # keep func's __name__ / __doc__ on the wrapper
    def timed(*args, **kw):
        start = time.time()
        result = func(*args, **kw)
        end = time.time()
        # A single-argument print() call emits the same text on Python 2
        # (parenthesised expression) and Python 3 (function call).
        print('Function: %r took: %2.4f seconds' % (func.__name__, end - start))
        return result

    return timed
def extract_text_objects(layout, LTObject, t=None):
    """Collect every instance of ``LTObject`` found beneath ``layout``.

    The tree is walked depth-first through each node's ``_objs`` attribute.
    A child that is an instance of ``LTObject`` is appended to the result and
    is not descended into; any other child is searched recursively. Nodes
    without an ``_objs`` attribute are treated as leaves.

    :param layout: root node to search.
    :param LTObject: class (or tuple of classes) to match with ``isinstance``.
    :param t: optional list to append matches to; a new list is created when
        omitted. When given, it is modified in place.
    :returns: the list of matches (``t`` itself when it was supplied), in
        traversal order.
    """
    if t is None:
        t = []
    # Only the attribute lookup is guarded, so an AttributeError raised by
    # anything else is not silently swallowed.
    try:
        children = layout._objs
    except AttributeError:
        return t
    for obj in children:
        if isinstance(obj, LTObject):
            t.append(obj)
        else:
            # Share the accumulator instead of building and concatenating a
            # temporary list at every level of the tree.
            extract_text_objects(obj, LTObject, t)
    return t
@timeit
def main():
    """Print the horizontal and vertical text lines of each page of a PDF.

    The PDF path is taken from ``sys.argv[1]``. Every page is run through
    pdfminer's layout analysis with vertical-text detection enabled; for each
    page the number of horizontal and vertical text lines is printed,
    followed by a pretty-printed list of their text.

    :raises PDFTextExtractionNotAllowed: if the document reports that it is
        not extractable.
    """
    with open(sys.argv[1], 'rb') as f:
        parser = PDFParser(f)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed(
                'Text extraction is not allowed: %r' % sys.argv[1])
        # 2.0, 0.5, 0.1 -- presumably pdfminer's default char/line/word
        # margins, kept for reference; confirm against the LAParams docs.
        laparams = LAParams(
            char_margin=1.0,
            line_margin=0.5,
            word_margin=0.1,
            detect_vertical=True,
        )
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # NOTE(review): the original indentation was lost; reporting once
            # per page is assumed here -- confirm.
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                layout = device.get_result()
                lh = extract_text_objects(layout, LTTextLineHorizontal)
                lv = extract_text_objects(layout, LTTextLineVertical)
                for orientation, lines in (('horizontal', lh), ('vertical', lv)):
                    # Single-argument print() behaves the same on Python 2
                    # and Python 3.
                    print("number of {0} text lines -> {1}".format(
                        orientation, len(lines)))
                    print("{0} text lines ->".format(orientation))
                    pprint([line.get_text() for line in lines])
        finally:
            # Release the device even if processing a page fails.
            device.close()
if __name__ == '__main__':
    # Without a PDF path there is nothing to process, so show the usage text
    # held in the module docstring instead.
    if len(sys.argv) < 2:
        # print() with one argument works on both Python 2 and Python 3.
        print(__doc__)
    else:
        main()