# camelot-py/camelot/pdf.py

import os
import shutil
import logging
import tempfile
import itertools
import multiprocessing as mp

import cv2
from PyPDF2 import PdfFileReader, PdfFileWriter

from .utils import get_page_layout, get_text_objects, get_rotation


# Public API of this module: only the Pdf manager class is exported.
__all__ = ['Pdf']

# Module-level logger, looked up under the fixed name "app_logger".
logger = logging.getLogger("app_logger")


def _parse_page_numbers(pagenos):
    """Expands page-range dicts into a sorted list of unique page numbers.

    Parameters
    ----------
    pagenos : list
        List of dicts describing page ranges. Each dict is read through
        its 'start' and 'end' keys, both ints; the range is inclusive
        on both ends.

    Returns
    -------
    page_numbers : list
        Ascending list of int page numbers with duplicates removed.
    """
    unique_pages = set()
    for page_range in pagenos:
        first, last = page_range['start'], page_range['end']
        # 'end' is inclusive, hence the + 1.
        unique_pages.update(range(first, last + 1))
    return sorted(unique_pages)


class Pdf:
    """Pdf manager.
    Handles all operations like temp directory creation, splitting file
    into single page pdfs, running extraction using multiple processes
    and removing the temp directory.

    Parameters
    ----------
    extractor : object
        camelot.stream.Stream or camelot.lattice.Lattice extractor
        object.

    pdfname : string
        Path to pdf file.

    pagenos : list
        List of dicts representing page ranges. A dict must have only
        two keys named 'start' and 'end' having int as their value.
        (optional, default: [{'start': 1, 'end': 1}])

    parallel : bool
        Whether or not to run using multiple processes.
        (optional, default: False)

    clean : bool
        Whether or not to remove the temp directory.
        (optional, default: False)

    Raises
    ------
    TypeError
        If `pdfname` does not end with '.pdf'.
    """

    def __init__(self, extractor, pdfname, pagenos=None,
                 parallel=False, clean=False):

        self.extractor = extractor
        self.pdfname = pdfname
        if not self.pdfname.endswith('.pdf'):
            raise TypeError("Only PDF format is supported right now.")
        # A None sentinel replaces the former mutable list default; the
        # effective default page range is unchanged.
        if pagenos is None:
            pagenos = [{'start': 1, 'end': 1}]
        self.pagenos = _parse_page_numbers(pagenos)
        self.parallel = parallel
        self.cpu_count = mp.cpu_count() if self.parallel else None
        # The worker pool is created lazily in extract() so that worker
        # processes never outlive the extraction run.
        self.pool = None
        self.clean = clean
        # Set by extract() when the extractor has debug enabled; defined
        # here so debug_plot() can report misuse cleanly.
        self.debug = None
        self.temp = tempfile.mkdtemp()

    def split(self):
        """Splits file into single page pdfs.

        Every page listed in `self.pagenos` is written to
        '<temp>/page-<n>.pdf'. Pages for which get_rotation reports a
        rotation are rewritten rotated by 90 degrees.
        """
        logger.info('Splitting pages...')
        # The reader pulls page data from the stream on demand, so the
        # input file has to stay open until every page has been written.
        with open(self.pdfname, 'rb') as pdf_in:
            infile = PdfFileReader(pdf_in, strict=False)
            for p in self.pagenos:
                sp_path = os.path.join(self.temp, 'page-{0}.pdf'.format(p))
                page = infile.getPage(p - 1)  # page numbers are 1-based
                outfile = PdfFileWriter()
                outfile.addPage(page)
                with open(sp_path, 'wb') as f:
                    outfile.write(f)
                self._correct_rotation(sp_path)

    def _correct_rotation(self, sp_path):
        """Rewrites the single page pdf at `sp_path` rotated by 90
        degrees when get_rotation reports 'left' or 'right'.

        The unrotated file is kept next to it as 'p-<n>_rotated.pdf'.
        """
        layout, dim = get_page_layout(sp_path)
        lttextlh = get_text_objects(layout, ltype="lh")
        lttextlv = get_text_objects(layout, ltype="lv")
        ltchar = get_text_objects(layout, ltype="char")
        rotation = get_rotation(lttextlh, lttextlv, ltchar)
        if rotation == '':
            return

        # Rename using the file name only; replacing 'page' in the whole
        # path would also rewrite a directory name containing 'page'.
        sp_dir, sp_file = os.path.split(sp_path)
        sp_stem, sp_ext = os.path.splitext(sp_file)
        sp_new_path = os.path.join(
            sp_dir, ''.join([sp_stem.replace('page', 'p'), '_rotated', sp_ext]))
        os.rename(sp_path, sp_new_path)

        with open(sp_new_path, 'rb') as rotated_in:
            sp_in = PdfFileReader(rotated_in, strict=False)
            sp_out = PdfFileWriter()
            sp_page = sp_in.getPage(0)
            if rotation == 'left':
                sp_page.rotateClockwise(90)
            elif rotation == 'right':
                sp_page.rotateCounterClockwise(90)
            sp_out.addPage(sp_page)
            with open(sp_path, 'wb') as pdf_out:
                sp_out.write(pdf_out)

    def extract(self):
        """Runs table extraction by calling extractor.get_tables
        on all single page pdfs.

        Returns
        -------
        tables : dict
            Merged result of every non-None dict returned by
            extractor.get_tables.

        Notes
        -----
        Debug data for debug_plot is only collected when `parallel` is
        False. When `clean` is True the temp directory is removed even
        if splitting or extraction fails.
        """
        try:
            self.split()
            pages = [os.path.join(self.temp, 'page-{0}.pdf'.format(p))
                     for p in self.pagenos]
            if self.parallel:
                tables = self._extract_parallel(pages)
            else:
                tables = self._extract_serial(pages)
        finally:
            if self.clean:
                self.remove_tempdir()
        return tables

    def _extract_parallel(self, pages):
        """Runs extractor.get_tables over `pages` in a process pool and
        merges the results into one dict.
        """
        self.pool = mp.Pool(processes=self.cpu_count)
        try:
            results = self.pool.map(self.extractor.get_tables, pages)
        finally:
            # Always shut the workers down so no processes are leaked.
            self.pool.close()
            self.pool.join()
            self.pool = None
        return {k: v for d in results if d is not None for k, v in d.items()}

    def _extract_serial(self, pages):
        """Runs extractor.get_tables over `pages` one by one, merging
        the results and collecting per-page debug data if enabled.
        """
        tables = {}
        collect_stream = collect_lattice = False
        if self.extractor.debug:
            collect_stream = self.extractor.method == 'stream'
            collect_lattice = self.extractor.method in ['lattice', 'ocr']
            if collect_stream:
                self.debug = self.extractor.debug
                self.debug_text = []
            elif collect_lattice:
                self.debug = self.extractor.debug
                self.debug_images = []
                self.debug_segments = []
                self.debug_tables = []
        for p in pages:
            table = self.extractor.get_tables(p)
            if table is not None:
                tables.update(table)
            # The extractor exposes the debug data of the page it just
            # processed; keep one entry per page.
            if collect_stream:
                self.debug_text.append(self.extractor.debug_text)
            elif collect_lattice:
                self.debug_images.append(self.extractor.debug_images)
                self.debug_segments.append(self.extractor.debug_segments)
                self.debug_tables.append(self.extractor.debug_tables)
        return tables

    def remove_tempdir(self):
        """Removes temporary directory that was created to save single
        page pdfs and their images.
        """
        shutil.rmtree(self.temp)

    def _debug_data(self, attr, extractor_name):
        """Returns the collected debug attribute `attr`, raising
        ValueError if it was never collected for this extractor type.
        """
        try:
            return getattr(self, attr)
        except AttributeError:
            raise ValueError(
                "This option only be used with {0}.".format(extractor_name))

    def debug_plot(self):
        """Generates a matplotlib plot based on the selected extractor
        debug option.

        Raises
        ------
        UserWarning
            If no supported debug option was recorded by extract().
        ValueError
            If the debug option does not match the data collected for
            the extractor that was used.
        """
        debug = getattr(self, 'debug', None)
        if debug is not True and debug not in ('contour', 'joint',
                                               'line', 'table'):
            raise UserWarning("This method can only be called after"
                " debug has been specified.")

        # Imported only once there is something to plot.
        import matplotlib.pyplot as plt
        import matplotlib.patches as patches

        if debug is True:
            for text in self._debug_data('debug_text', 'Stream'):
                fig = plt.figure()
                ax = fig.add_subplot(111, aspect='equal')
                xs, ys = [], []
                for t in text:
                    # t is drawn as a box from (t[0], t[1]) to
                    # (t[2], t[3]); the axis limits use the same layout.
                    xs.extend([t[0], t[2]])
                    ys.extend([t[1], t[3]])
                    ax.add_patch(
                        patches.Rectangle(
                            (t[0], t[1]),
                            t[2] - t[0],
                            t[3] - t[1]
                        )
                    )
                if xs:  # min()/max() fail on a page without text
                    ax.set_xlim(min(xs) - 10, max(xs) + 10)
                    ax.set_ylim(min(ys) - 10, max(ys) + 10)
                plt.show()
        elif debug == 'contour':
            for img, table_bbox in self._debug_data('debug_images', 'Lattice'):
                for t in table_bbox.keys():
                    cv2.rectangle(img, (t[0], t[1]),
                                  (t[2], t[3]), (255, 0, 0), 3)
                plt.imshow(img)
                plt.show()
        elif debug == 'joint':
            for img, table_bbox in self._debug_data('debug_images', 'Lattice'):
                x_coord = []
                y_coord = []
                for k in table_bbox.keys():
                    for coord in table_bbox[k]:
                        x_coord.append(coord[0])
                        y_coord.append(coord[1])
                if x_coord:  # max() fails when no joints were found
                    max_x, max_y = max(x_coord), max(y_coord)
                    plt.plot(x_coord, y_coord, 'ro')
                    # y limits are passed high-to-low, matching image
                    # coordinates with the origin at the top.
                    plt.axis([0, max_x + 100, max_y + 100, 0])
                plt.imshow(img)
                plt.show()
        elif debug == 'line':
            for v_s, h_s in self._debug_data('debug_segments', 'Lattice'):
                for v in v_s:
                    plt.plot([v[0], v[2]], [v[1], v[3]])
                for h in h_s:
                    plt.plot([h[0], h[2]], [h[1], h[3]])
                plt.show()
        else:  # debug == 'table'
            for tables in self._debug_data('debug_tables', 'Lattice'):
                for table in tables:
                    for r in range(len(table.rows)):
                        for c in range(len(table.cols)):
                            cell = table.cells[r][c]
                            # Draw only the edges flagged on the cell.
                            if cell.left:
                                plt.plot([cell.lb[0], cell.lt[0]],
                                         [cell.lb[1], cell.lt[1]])
                            if cell.right:
                                plt.plot([cell.rb[0], cell.rt[0]],
                                         [cell.rb[1], cell.rt[1]])
                            if cell.top:
                                plt.plot([cell.lt[0], cell.rt[0]],
                                         [cell.lt[1], cell.rt[1]])
                            if cell.bottom:
                                plt.plot([cell.lb[0], cell.rb[0]],
                                         [cell.lb[1], cell.rb[1]])
                plt.show()