camelot-py/camelot/handlers.py

159 lines
5.4 KiB
Python

# -*- coding: utf-8 -*-
import os
import sys
from PyPDF2 import PdfFileReader, PdfFileWriter
from .core import TableList
from .parsers import Stream, Lattice
from .utils import (TemporaryDirectory, get_page_layout, get_text_objects,
get_rotation)
class PDFHandler(object):
    """Handles all operations like temp directory creation, splitting
    file into single page PDFs, parsing each PDF and then removing the
    temp directory.

    Parameters
    ----------
    filename : str
        Path to PDF file.
    pages : str, optional (default: '1')
        Comma-separated page numbers.
        Example: '1,3,4' or '1,4-end' or 'all'.
    password : str, optional (default: None)
        Password for decryption.

    Raises
    ------
    NotImplementedError
        If ``filename`` does not end with '.pdf' (case-insensitive).

    """
    def __init__(self, filename, pages='1', password=None):
        self.filename = filename
        if not filename.lower().endswith('.pdf'):
            raise NotImplementedError("File format not supported")
        # The password has to be in place before the page spec is
        # resolved: counting pages of an encrypted file needs it.
        if password is None:
            self.password = ''
        else:
            self.password = password
            if sys.version_info[0] < 3:
                self.password = self.password.encode('ascii')
        self.pages = self._get_pages(self.filename, pages)

    def _get_num_pages(self, filename):
        """Returns the number of pages in the PDF, decrypting it with
        ``self.password`` first when it is encrypted.

        Parameters
        ----------
        filename : str
            Path to PDF file.

        Returns
        -------
        n : int
            Page count reported by PyPDF2.

        """
        # Context manager so the handle is released even if PyPDF2 raises.
        with open(filename, 'rb') as fileobj:
            infile = PdfFileReader(fileobj, strict=False)
            if infile.isEncrypted:
                infile.decrypt(self.password)
            return infile.getNumPages()

    def _get_pages(self, filename, pages):
        """Converts pages string to list of ints.

        The PDF is only opened when the page count is actually needed,
        i.e. for 'all' or for a range that ends in 'end'.

        Parameters
        ----------
        filename : str
            Path to PDF file.
        pages : str, optional (default: '1')
            Comma-separated page numbers.
            Example: 1,3,4 or 1,4-end or all.

        Returns
        -------
        P : list
            Sorted list of unique int page numbers.

        Raises
        ------
        ValueError
            If a token is not an int, or a range does not have exactly
            one '-' separator.

        """
        if pages == 'all':
            return list(range(1, self._get_num_pages(filename) + 1))

        P = set()
        num_pages = None  # looked up at most once, and only for '-end'
        for r in pages.split(','):
            r = r.strip()
            if '-' in r:
                a, b = r.split('-')
                b = b.strip()
                if b == 'end':
                    if num_pages is None:
                        num_pages = self._get_num_pages(filename)
                    b = num_pages
                P.update(range(int(a), int(b) + 1))
            else:
                P.add(int(r))
        return sorted(P)

    def _save_page(self, filename, page, temp):
        """Saves specified page from PDF into a temporary directory.

        The page is written to ``<temp>/page-<page>.pdf``. If the text on
        it is detected as rotated, the first copy is renamed to
        ``<temp>/p-<page>_rotated.pdf`` and a copy rotated by 90 degrees
        in the opposite direction is written under the original name.

        Parameters
        ----------
        filename : str
            Path to PDF file.
        page : int
            Page number (1-based).
        temp : str
            Tmp directory.

        """
        fpath = os.path.join(temp, 'page-{0}.pdf'.format(page))
        with open(filename, 'rb') as fileobj:
            infile = PdfFileReader(fileobj, strict=False)
            if infile.isEncrypted:
                infile.decrypt(self.password)
            outfile = PdfFileWriter()
            outfile.addPage(infile.getPage(page - 1))
            # Write while the source is still open: the writer pulls the
            # page data from the reader's underlying stream.
            with open(fpath, 'wb') as f:
                outfile.write(f)

        layout, _ = get_page_layout(fpath)
        # fix rotated PDF
        lttextlh = get_text_objects(layout, ltype="lh")
        lttextlv = get_text_objects(layout, ltype="lv")
        ltchar = get_text_objects(layout, ltype="char")
        rotation = get_rotation(lttextlh, lttextlv, ltchar)
        if rotation != '':
            # Build the name from the temp dir instead of string-replacing
            # 'page' in the whole path, which would also rewrite a temp
            # directory whose own name happens to contain 'page'.
            fpath_new = os.path.join(
                temp, 'p-{0}_rotated.pdf'.format(page))
            os.rename(fpath, fpath_new)
            with open(fpath_new, 'rb') as rotated:
                infile = PdfFileReader(rotated, strict=False)
                if infile.isEncrypted:
                    infile.decrypt(self.password)
                outfile = PdfFileWriter()
                p = infile.getPage(0)
                # Undo the detected rotation by turning the other way.
                if rotation == 'anticlockwise':
                    p.rotateClockwise(90)
                elif rotation == 'clockwise':
                    p.rotateCounterClockwise(90)
                outfile.addPage(p)
                with open(fpath, 'wb') as f:
                    outfile.write(f)

    def parse(self, flavor='lattice', suppress_stdout=False, **kwargs):
        """Extracts tables by calling parser.extract_tables on all single
        page PDFs.

        Parameters
        ----------
        flavor : str (default: 'lattice')
            The parsing method to use ('lattice' or 'stream').
            Lattice is used by default; any other value selects Stream.
        suppress_stdout : bool (default: False)
            Suppress logs and warnings.
        kwargs : dict
            See camelot.read_pdf kwargs.

        Returns
        -------
        tables : camelot.core.TableList
            List of tables found in PDF.

        """
        tables = []
        with TemporaryDirectory() as tempdir:
            for p in self.pages:
                self._save_page(self.filename, p, tempdir)
            pages = [os.path.join(tempdir, 'page-{0}.pdf'.format(p))
                     for p in self.pages]
            parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs)
            for p in pages:
                t = parser.extract_tables(p, suppress_stdout=suppress_stdout)
                tables.extend(t)
        return TableList(tables)