Move Pdf class to core as FileHandler

2018-09-04 07:02:30 +05:30 · 2018-09-04 07:02:30 +05:30 · 5d29f0c21c
parent 0c9e21d881
commit 5d29f0c21c
2 changed files with 79 additions and 411 deletions
--- a/camelot/core.py
+++ b/camelot/core.py
@ -1,69 +1,14 @@
+import os
+import tempfile
+
 import numpy as np
 import pandas as pd
+from PyPDF2 import PdfFileReader, PdfFileWriter
+
+from .utils import get_page_layout, get_text_objects, get_rotation


 class Cell(object):
-    """Cell.
-    Defines a cell object with coordinates relative to a left-bottom
-    origin, which is also PDFMiner's coordinate space.
-
-    Parameters
-    ----------
-    x1 : float
-        x-coordinate of left-bottom point.
-
-    y1 : float
-        y-coordinate of left-bottom point.
-
-    x2 : float
-        x-coordinate of right-top point.
-
-    y2 : float
-        y-coordinate of right-top point.
-
-    Attributes
-    ----------
-    lb : tuple
-        Tuple representing left-bottom coordinates.
-
-    lt : tuple
-        Tuple representing left-top coordinates.
-
-    rb : tuple
-        Tuple representing right-bottom coordinates.
-
-    rt : tuple
-        Tuple representing right-top coordinates.
-
-    bbox : tuple
-        Tuple representing the cell's bounding box using the
-        lower-bottom and right-top coordinates.
-
-    left : bool
-        Whether or not cell is bounded on the left.
-
-    right : bool
-        Whether or not cell is bounded on the right.
-
-    top : bool
-        Whether or not cell is bounded on the top.
-
-    bottom : bool
-        Whether or not cell is bounded on the bottom.
-
-    text_objects : list
-        List of text objects assigned to cell.
-
-    text : string
-        Text assigned to cell.
-
-    spanning_h : bool
-        Whether or not cell spans/extends horizontally.
-
-    spanning_v : bool
-        Whether or not cell spans/extends vertically.
-    """
-
    def __init__(self, x1, y1, x2, y2):

        self.x1 = x1
@ -86,76 +31,23 @@ class Cell(object):
        self.image = None

    def add_text(self, text):
-        """Adds text to cell.
-
-        Parameters
-        ----------
-        text : string
-        """
        self.text = ''.join([self.text, text])

    def get_text(self):
-        """Returns text assigned to cell.
-
-        Returns
-        -------
-        text : string
-        """
        return self.text

    def add_object(self, t_object):
-        """Adds PDFMiner text object to cell.
-
-        Parameters
-        ----------
-        t_object : object
-        """
        self.text_objects.append(t_object)

    def get_objects(self):
-        """Returns list of text objects assigned to cell.
-
-        Returns
-        -------
-        text_objects : list
-        """
        return self.text_objects

    def get_bounded_edges(self):
-        """Returns the number of edges by which a cell is bounded.
-
-        Returns
-        -------
-        bounded_edges : int
-        """
        self.bounded_edges = self.top + self.bottom + self.left + self.right
        return self.bounded_edges


 class Table(object):
-    """Table.
-    Defines a table object with coordinates relative to a left-bottom
-    origin, which is also PDFMiner's coordinate space.
-
-    Parameters
-    ----------
-    cols : list
-        List of tuples representing column x-coordinates in increasing
-        order.
-
-    rows : list
-        List of tuples representing row y-coordinates in decreasing
-        order.
-
-    Attributes
-    ----------
-    cells : list
-        List of cell objects with row-major ordering.
-
-    nocont_ : int
-        Number of lines that did not contribute to setting cell edges.
-    """
-
    def __init__(self, cols, rows):

        self.cols = cols
@ -166,8 +58,6 @@ class Table(object):
        self.image = None

    def set_all_edges(self):
-        """Sets all table edges to True.
-        """
        for r in range(len(self.rows)):
            for c in range(len(self.cols)):
                self.cells[r][c].left = True
@ -177,8 +67,6 @@ class Table(object):
        return self

    def set_border_edges(self):
-        """Sets table border edges to True.
-        """
        for r in range(len(self.rows)):
            self.cells[r][0].left = True
            self.cells[r][len(self.cols) - 1].right = True
@ -188,19 +76,6 @@ class Table(object):
        return self

    def set_edges(self, vertical, horizontal, jtol=2):
-        """Sets a cell's edges to True depending on whether they
-        overlap with lines found by imgproc.
-
-        Parameters
-        ----------
-        vertical : list
-            List of vertical lines detected by imgproc. Coordinates
-            scaled and translated to the PDFMiner's coordinate space.
-
-        horizontal : list
-            List of horizontal lines detected by imgproc. Coordinates
-            scaled and translated to the PDFMiner's coordinate space.
-        """
        for v in vertical:
            # find closest x coord
            # iterate over y coords and find closest points
@ -308,10 +183,6 @@ class Table(object):
        return self

    def set_spanning(self):
-        """Sets a cell's spanning_h or spanning_v attribute to True
-        depending on whether the cell spans/extends horizontally or
-        vertically.
-        """
        for r in range(len(self.rows)):
            for c in range(len(self.cols)):
                bound = self.cells[r][c].get_bounded_edges()
@ -351,13 +222,6 @@ class Table(object):
        return self

    def get_list(self):
-        """Returns a two-dimensional list of text assigned to each
-        cell.
-
-        Returns
-        -------
-        ar : list
-        """
        ar = []
        for r in range(len(self.rows)):
            ar.append([self.cells[r][c].get_text().strip()
@ -367,3 +231,75 @@ class Table(object):

 class TableSet(object):
    pass
+
+
+class FileHandler(object):
+    """File handler.
+    Splits a pdf into single-page pdfs inside a temp directory and
+    runs a parser's get_tables on each of them.
+
+    Parameters
+    ----------
+    filename : string
+        Path to pdf file. Must end with '.pdf'.
+
+    pages : string
+        Comma-separated page numbers and/or ranges, e.g. '1,3-5,7-end',
+        or 'all'.
+        (optional, default: '1')
+
+    parser : object
+        Object exposing get_tables(path), which returns a dict or None
+        for a single-page pdf. It can also be assigned to the `parser`
+        attribute after construction.
+        (optional, default: None)
+
+    Attributes
+    ----------
+    pages : list
+        Sorted list of unique int page numbers (1-indexed).
+
+    temp : string
+        Directory created with tempfile.mkdtemp() that receives the
+        single-page pdfs. It is not removed by this class.
+    """
+
+    def __init__(self, filename, pages='1', parser=None):
+        self.filename = filename
+        if not self.filename.endswith('.pdf'):
+            raise TypeError("File format not supported.")
+        # Double-underscore names are mangled inside the class body, so
+        # the helpers have to be reached through self (or the class).
+        self.pages = self.__get_pages(self.filename, pages)
+        self.parser = parser
+        self.temp = tempfile.mkdtemp()
+
+    @staticmethod
+    def __get_pages(filename, pages):
+        """Converts a page specification string to a list of ints.
+
+        Parameters
+        ----------
+        filename : string
+            Path to pdf file. Only opened when pages is not '1'.
+
+        pages : string
+            '1', 'all', or comma-separated entries of the form 'n' or
+            'a-b', where b may be the literal 'end' (last page).
+
+        Returns
+        -------
+        page_numbers : list
+            Sorted list of unique int page numbers.
+
+        Raises
+        ------
+        ValueError
+            If an entry cannot be converted with int() or a range has
+            more than one '-'.
+        """
+        page_numbers = []
+        if pages == '1':
+            # default case, the file does not need to be opened
+            page_numbers.append(1)
+        else:
+            with open(filename, 'rb') as fileobj:
+                infile = PdfFileReader(fileobj, strict=False)
+                if pages == 'all':
+                    page_numbers.extend(range(1, infile.getNumPages() + 1))
+                else:
+                    for r in pages.split(','):
+                        if '-' in r:
+                            a, b = r.split('-')
+                            if b == 'end':
+                                b = infile.getNumPages()
+                            page_numbers.extend(range(int(a), int(b) + 1))
+                        else:
+                            page_numbers.append(int(r))
+        # overlapping ranges collapse to one entry per page
+        return sorted(set(page_numbers))
+
+    @staticmethod
+    def __save_page(filename, page, temp):
+        """Saves a single page of the pdf as 'page-<page>.pdf' in temp.
+
+        If get_rotation reports a non-empty rotation for the saved page,
+        the file is renamed to 'p-<page>_rotated.pdf' and a rotated copy
+        is written back to 'page-<page>.pdf'.
+
+        Parameters
+        ----------
+        filename : string
+            Path to the source pdf file.
+
+        page : int
+            1-indexed page number to save.
+
+        temp : string
+            Directory in which the single-page pdf is written.
+        """
+        fpath = os.path.join(temp, 'page-{0}.pdf'.format(page))
+        with open(filename, 'rb') as fileobj:
+            infile = PdfFileReader(fileobj, strict=False)
+            outfile = PdfFileWriter()
+            outfile.addPage(infile.getPage(page - 1))
+            # write while the source file is still open
+            with open(fpath, 'wb') as f:
+                outfile.write(f)
+        layout, _ = get_page_layout(fpath)
+        # fix rotated pdf
+        lttextlh = get_text_objects(layout, ltype="lh")
+        lttextlv = get_text_objects(layout, ltype="lv")
+        ltchar = get_text_objects(layout, ltype="char")
+        rotation = get_rotation(lttextlh, lttextlv, ltchar)
+        if rotation != '':
+            # Build the new name from the page number instead of doing a
+            # str.replace on the full path, which would also rewrite a
+            # 'page' substring inside the temp directory name.
+            fpath_new = os.path.join(
+                temp, 'p-{0}_rotated.pdf'.format(page))
+            os.rename(fpath, fpath_new)
+            with open(fpath_new, 'rb') as fileobj:
+                infile = PdfFileReader(fileobj, strict=False)
+                outfile = PdfFileWriter()
+                p = infile.getPage(0)
+                if rotation == 'left':
+                    p.rotateClockwise(90)
+                elif rotation == 'right':
+                    p.rotateCounterClockwise(90)
+                outfile.addPage(p)
+                # keep the reader's file open until the writer is done
+                with open(fpath, 'wb') as f:
+                    outfile.write(f)
+
+    def parse(self):
+        """Saves every selected page and runs parser.get_tables on it.
+
+        Returns
+        -------
+        tables : dict
+            Union of the dicts returned by parser.get_tables for each
+            page; pages for which it returns None are skipped.
+
+        Raises
+        ------
+        AttributeError
+            If no parser has been set.
+        """
+        if self.parser is None:
+            raise AttributeError(
+                "FileHandler.parser must be set before calling parse().")
+        for p in self.pages:
+            self.__save_page(self.filename, p, self.temp)
+        pages = [os.path.join(self.temp, 'page-{0}.pdf'.format(p))
+                 for p in self.pages]
+        tables = {}
+        for p in pages:
+            table = self.parser.get_tables(p)
+            if table is not None:
+                tables.update(table)
+        return tables
--- a/camelot/pdf.py
+++ b/camelot/pdf.py
@ -1,268 +0,0 @@
-import os
-import shutil
-import tempfile
-import itertools
-import multiprocessing as mp
-from functools import partial
-
-import cv2
-from PyPDF2 import PdfFileReader, PdfFileWriter
-
-from .utils import get_page_layout, get_text_objects, get_rotation
-
-
-__all__ = ['Pdf']
-
-
-def _parse_page_numbers(pagenos):
-    """Converts list of dicts to list of ints.
-
-    Parameters
-    ----------
-    pagenos : list
-        List of dicts representing page ranges. A dict must have only
-        two keys named 'start' and 'end' having int as their value.
-
-    Returns
-    -------
-    page_numbers : list
-        List of int page numbers.
-    """
-    page_numbers = []
-    for p in pagenos:
-        page_numbers.extend(range(p['start'], p['end'] + 1))
-    page_numbers = sorted(set(page_numbers))
-    return page_numbers
-
-
-def _save_page(temp, pdfname, pageno):
-    with open(pdfname, 'rb') as pdffile:
-        infile = PdfFileReader(pdffile, strict=False)
-        sp_path = os.path.join(temp, 'page-{0}.pdf'.format(pageno))
-        sp_name, sp_ext = os.path.splitext(sp_path)
-        page = infile.getPage(pageno - 1)
-        outfile = PdfFileWriter()
-        outfile.addPage(page)
-        with open(sp_path, 'wb') as f:
-            outfile.write(f)
-        layout, dim = get_page_layout(sp_path)
-        lttextlh = get_text_objects(layout, ltype="lh")
-        lttextlv = get_text_objects(layout, ltype="lv")
-        ltchar = get_text_objects(layout, ltype="char")
-        rotation = get_rotation(lttextlh, lttextlv, ltchar)
-        if rotation != '':
-            sp_new_path = ''.join([sp_name.replace('page', 'p'), '_rotated', sp_ext])
-            os.rename(sp_path, sp_new_path)
-            sp_in = PdfFileReader(open(sp_new_path, 'rb'),
-                strict=False)
-            sp_out = PdfFileWriter()
-            sp_page = sp_in.getPage(0)
-            if rotation == 'left':
-                sp_page.rotateClockwise(90)
-            elif rotation == 'right':
-                sp_page.rotateCounterClockwise(90)
-            sp_out.addPage(sp_page)
-            with open(sp_path, 'wb') as pdf_out:
-                sp_out.write(pdf_out)
-
-
-class Pdf:
-    """Pdf manager.
-    Handles all operations like temp directory creation, splitting file
-    into single page pdfs, running extraction using multiple processes
-    and removing the temp directory.
-
-    Parameters
-    ----------
-    extractor : object
-        camelot.stream.Stream or camelot.lattice.Lattice extractor
-        object.
-
-    pdfname : string
-        Path to pdf file.
-
-    pagenos : list
-        List of dicts representing page ranges. A dict must have only
-        two keys named 'start' and 'end' having int as their value.
-        (optional, default: [{'start': 1, 'end': 1}])
-
-    parallel : bool
-        Whether or not to run using multiple processes.
-        (optional, default: False)
-
-    clean : bool
-        Whether or not to remove the temp directory.
-        (optional, default: False)
-    """
-
-    def __init__(self, extractor, pdfname, pagenos=[{'start': 1, 'end': 1}],
-                 parallel=False, clean=False):
-
-        self.extractor = extractor
-        self.pdfname = pdfname
-        if not self.pdfname.endswith('.pdf'):
-            raise TypeError("File format not supported.")
-        self.pagenos = _parse_page_numbers(pagenos)
-        self.parallel = parallel
-        if self.parallel:
-            self.cpu_count = mp.cpu_count()
-            self.pool = mp.Pool(processes=self.cpu_count)
-        self.clean = clean
-        self.temp = tempfile.mkdtemp()
-
-    def split(self):
-        """Splits file into single page pdfs.
-        """
-        if self.parallel:
-            pfunc = partial(_save_page, self.temp, self.pdfname)
-            self.pool.map(pfunc, self.pagenos)
-        else:
-            for p in self.pagenos:
-                _save_page(self.temp, self.pdfname, p)
-
-
-    def extract(self):
-        """Runs table extraction by calling extractor.get_tables
-        on all single page pdfs.
-        """
-        self.split()
-        pages = [os.path.join(self.temp, 'page-{0}.pdf'.format(p))
-                 for p in self.pagenos]
-        if self.parallel:
-            tables = self.pool.map(self.extractor.get_tables, pages)
-            tables = {k: v for d in tables if d is not None for k, v in d.items()}
-        else:
-            tables = {}
-            if self.extractor.debug:
-                if self.extractor.method == 'stream':
-                    self.debug = self.extractor.debug
-                    self.debug_text = []
-                elif self.extractor.method in ['lattice', 'ocrl']:
-                    self.debug = self.extractor.debug
-                    self.debug_images = []
-                    self.debug_segments = []
-                    self.debug_tables = []
-                elif self.extractor.method == 'ocrs':
-                    self.debug = self.extractor.debug
-                    self.debug_images = []
-            for p in pages:
-                table = self.extractor.get_tables(p)
-                if table is not None:
-                    tables.update(table)
-                if self.extractor.debug:
-                    if self.extractor.method == 'stream':
-                        self.debug_text.append(self.extractor.debug_text)
-                    elif self.extractor.method in ['lattice', 'ocr']:
-                        self.debug_images.append(self.extractor.debug_images)
-                        self.debug_segments.append(self.extractor.debug_segments)
-                        self.debug_tables.append(self.extractor.debug_tables)
-                    elif self.extractor.method == 'ocrs':
-                        self.debug_images.append(self.extractor.debug_images)
-        if self.clean:
-            self.remove_tempdir()
-        return tables
-
-    def remove_tempdir(self):
-        """Removes temporary directory that was created to save single
-        page pdfs and their images.
-        """
-        shutil.rmtree(self.temp)
-
-    def debug_plot(self):
-        """Generates a matplotlib plot based on the selected extractor
-        debug option.
-        """
-        import matplotlib.pyplot as plt
-        import matplotlib.patches as patches
-
-        if self.debug is True:
-            if hasattr(self, 'debug_text'):
-                for text in self.debug_text:
-                    fig = plt.figure()
-                    ax = fig.add_subplot(111, aspect='equal')
-                    xs, ys = [], []
-                    for t in text:
-                        xs.extend([t[0], t[1]])
-                        ys.extend([t[2], t[3]])
-                        ax.add_patch(
-                            patches.Rectangle(
-                                (t[0], t[1]),
-                                t[2] - t[0],
-                                t[3] - t[1]
-                            )
-                        )
-                    ax.set_xlim(min(xs) - 10, max(xs) + 10)
-                    ax.set_ylim(min(ys) - 10, max(ys) + 10)
-                    plt.show()
-            elif hasattr(self, 'debug_images'):
-                for img in self.debug_images:
-                    plt.imshow(img)
-                    plt.show()
-        elif self.debug == 'contour':
-            try:
-                for img, table_bbox in self.debug_images:
-                    for t in table_bbox.keys():
-                        cv2.rectangle(img, (t[0], t[1]),
-                                      (t[2], t[3]), (255, 0, 0), 3)
-                    plt.imshow(img)
-                    plt.show()
-            except AttributeError:
-                raise ValueError("This option can only be used with Lattice.")
-        elif self.debug == 'joint':
-            try:
-                for img, table_bbox in self.debug_images:
-                    x_coord = []
-                    y_coord = []
-                    for k in table_bbox.keys():
-                        for coord in table_bbox[k]:
-                            x_coord.append(coord[0])
-                            y_coord.append(coord[1])
-                    max_x, max_y = max(x_coord), max(y_coord)
-                    plt.plot(x_coord, y_coord, 'ro')
-                    plt.axis([0, max_x + 100, max_y + 100, 0])
-                    plt.imshow(img)
-                    plt.show()
-            except AttributeError:
-                raise ValueError("This option can only be used with Lattice.")
-        elif self.debug == 'line':
-            try:
-                for v_s, h_s in self.debug_segments:
-                    for v in v_s:
-                        plt.plot([v[0], v[2]], [v[1], v[3]])
-                    for h in h_s:
-                        plt.plot([h[0], h[2]], [h[1], h[3]])
-                    plt.show()
-            except AttributeError:
-                raise ValueError("This option can only be used with Lattice.")
-        elif self.debug == 'table':
-            try:
-                for tables in self.debug_tables:
-                    for table in tables:
-                        for r in range(len(table.rows)):
-                            for c in range(len(table.cols)):
-                                if table.cells[r][c].left:
-                                    plt.plot([table.cells[r][c].lb[0],
-                                              table.cells[r][c].lt[0]],
-                                             [table.cells[r][c].lb[1],
-                                              table.cells[r][c].lt[1]])
-                                if table.cells[r][c].right:
-                                    plt.plot([table.cells[r][c].rb[0],
-                                              table.cells[r][c].rt[0]],
-                                             [table.cells[r][c].rb[1],
-                                              table.cells[r][c].rt[1]])
-                                if table.cells[r][c].top:
-                                    plt.plot([table.cells[r][c].lt[0],
-                                              table.cells[r][c].rt[0]],
-                                             [table.cells[r][c].lt[1],
-                                              table.cells[r][c].rt[1]])
-                                if table.cells[r][c].bottom:
-                                    plt.plot([table.cells[r][c].lb[0],
-                                              table.cells[r][c].rb[0]],
-                                             [table.cells[r][c].lb[1],
-                                              table.cells[r][c].rb[1]])
-                    plt.show()
-            except AttributeError:
-                raise ValueError("This option can only be used with Lattice.")
-        else:
-            raise UserWarning("This method can only be called after"
-                " debug has been specified.")