camelot-py/camelot/pdf.py

165 lines
5.2 KiB
Python

import os
import shutil
import tempfile
from PyPDF2 import PdfFileReader, PdfFileWriter
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal
from wand.image import Image
# Public API of this module: only the Pdf class is exported by a star
# import; the underscore-prefixed helper functions stay private.
__all__ = ['Pdf']
def _parse_page_numbers(pagenos):
"""Converts list of page ranges to a list of page numbers.
Parameters
----------
pagenos : list
List of dicts containing page ranges.
Returns
-------
page_numbers : list
List of page numbers.
"""
page_numbers = []
for p in pagenos:
page_numbers.extend(range(p['start'], p['end'] + 1))
page_numbers = sorted(set(page_numbers))
return page_numbers
def _extract_text_objects(layout, LTObject, t=None):
"""Recursively parses pdf layout to get a list of
text objects.
Parameters
----------
layout : object
Layout object.
LTObject : object
Text object, either LTChar or LTTextLineHorizontal.
t : list (optional, default: None)
Returns
-------
t : list
List of text objects.
"""
if t is None:
t = []
try:
for obj in layout._objs:
if isinstance(obj, LTObject):
t.append(obj)
else:
t += _extract_text_objects(obj, LTObject)
except AttributeError:
pass
return t
class Pdf:
    """Handles all pdf operations which include:

    1. Split pdf into single page pdfs using given page numbers
    2. Convert single page pdfs into images
    3. Extract text from single page pdfs

    All intermediate files are written to a temporary directory that is
    created in the constructor and stored in `self.temp`. It is not
    removed automatically; call `remove_tempdir` when done.

    Parameters
    ----------
    pdfname : string
        Path to pdf.
    pagenos : list
        List of dicts which specify pdf page ranges.
        (optional, default: [{'start': 1, 'end': 1}])
    char_margin : float
        Chars closer than char_margin are grouped together to form a
        word. (optional, default: 2.0)
    line_margin : float
        Lines closer than line_margin are grouped together to form a
        textbox. (optional, default: 0.5)
    word_margin : float
        Insert blank spaces between chars if distance between words
        is greater than word_margin. (optional, default: 0.1)
    clean : bool
        Stored as `self.clean`; not read by any method of this class.
        (optional, default: False)
    """

    def __init__(self, pdfname, pagenos=None,
                 char_margin=2.0, line_margin=0.5, word_margin=0.1,
                 clean=False):
        # A None sentinel replaces the former mutable list default so the
        # default page range can never be shared between instances.
        if pagenos is None:
            pagenos = [{'start': 1, 'end': 1}]
        self.pdfname = pdfname
        self.pagenos = _parse_page_numbers(pagenos)
        self.char_margin = char_margin
        self.line_margin = line_margin
        self.word_margin = word_margin
        self.clean = clean
        self.temp = tempfile.mkdtemp()

    def split(self):
        """Splits pdf into single page pdfs.

        Each page number in `self.pagenos` is written to
        `<self.temp>/pg-<page number>.pdf`.

        Raises
        ------
        TypeError
            If `self.pdfname` does not end with '.pdf' (compared
            case-insensitively).
        """
        if not self.pdfname.lower().endswith('.pdf'):
            raise TypeError("Only PDF format is supported.")
        # Keep the source open until every page has been written: the
        # reader may pull page data from the stream on demand.
        with open(self.pdfname, 'rb') as src:
            infile = PdfFileReader(src, strict=False)
            for p in self.pagenos:
                # Page numbers are 1-based, getPage() is 0-based.
                page = infile.getPage(p - 1)
                outfile = PdfFileWriter()
                outfile.addPage(page)
                pagepath = os.path.join(self.temp, 'pg-{0}.pdf'.format(p))
                with open(pagepath, 'wb') as f:
                    outfile.write(f)

    def extract(self):
        """Extracts text objects, width, height from a pdf.

        Reads the single page pdfs named `pg-<page number>.pdf` in
        `self.temp` (the files `split` writes), so `split` has to be
        called first.

        Yields
        ------
        tuple
            (page number, list of LTChar objects, list of
            LTTextLineHorizontal objects, width, height) where width and
            height are `layout.bbox[2]` and `layout.bbox[3]`.

        Raises
        ------
        PDFTextExtractionNotAllowed
            If the document reports that it is not extractable.
        """
        # Layout parameters do not depend on the page, build them once.
        laparams = LAParams(char_margin=self.char_margin,
                            line_margin=self.line_margin,
                            word_margin=self.word_margin)
        for p in self.pagenos:
            pkey = 'pg-{0}'.format(p)
            pname = os.path.join(self.temp, '{}.pdf'.format(pkey))
            # PDFs are binary data; text mode can corrupt or reject them.
            with open(pname, 'rb') as f:
                parser = PDFParser(f)
                document = PDFDocument(parser)
                if not document.is_extractable:
                    raise PDFTextExtractionNotAllowed
                rsrcmgr = PDFResourceManager()
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # split() writes one page per file, so this normally
                # yields exactly once for every page number.
                for page in PDFPage.create_pages(document):
                    interpreter.process_page(page)
                    layout = device.get_result()
                    lattice_objects = _extract_text_objects(layout, LTChar)
                    stream_objects = _extract_text_objects(
                        layout, LTTextLineHorizontal)
                    width = layout.bbox[2]
                    height = layout.bbox[3]
                    yield p, lattice_objects, stream_objects, width, height

    def convert(self):
        """Converts single page pdfs to images.

        For every page number in `self.pagenos`, `pg-<page number>.pdf`
        in `self.temp` is opened with depth 8 and resolution 300 and
        saved next to it as `pg-<page number>.png`.
        """
        for p in self.pagenos:
            pdfname = os.path.join(self.temp, 'pg-{0}.pdf'.format(p))
            imagename = os.path.join(self.temp, 'pg-{0}.png'.format(p))
            with Image(filename=pdfname, depth=8, resolution=300) as png:
                png.save(filename=imagename)

    def remove_tempdir(self):
        """Deletes the temporary directory and everything inside it."""
        shutil.rmtree(self.temp)