camelot-py/camelot/stream.py

from __future__ import division
import os
import types
import logging
import copy_reg

import numpy as np

from .table import Table
from .utils import (get_row_index, get_score, count_empty, encode_list,
                    pdf_to_text, text_bbox)


__all__ = ['Stream']


def _reduce_method(m):
    if m.im_self is None:
        return getattr, (m.im_class, m.im_func.func_name)
    else:
        return getattr, (m.im_self, m.im_func.func_name)
copy_reg.pickle(types.MethodType, _reduce_method)


def _group_rows(text, ytol=2):
    """Groups text objects into rows using ytol.

    Parameters
    ----------
    text : list
        List of text objects.

    ytol : int
        Tolerance to account for when grouping rows
        together. (optional, default: 2)

    Returns
    -------
    rows : list
        List of grouped text rows.
    """
    row_y = 0
    rows = []
    temp = []
    for t in text:
        # is checking for upright necessary?
        # if t.get_text().strip() and all([obj.upright for obj in t._objs if
        # type(obj) is LTChar]):
        if t.get_text().strip():
            if not np.isclose(row_y, t.y0, atol=ytol):
                rows.append(sorted(temp, key=lambda t: t.x0))
                temp = []
                row_y = t.y0
            temp.append(t)
    rows.append(sorted(temp, key=lambda t: t.x0))
    __ = rows.pop(0) # hacky
    return rows


def _merge_columns(l, mtol=0):
    """Merges overlapping columns and returns list with updated
    columns boundaries.

    Parameters
    ----------
    l : list
        List of column x-coordinates.

    Returns
    -------
    merged : list
        List of merged column x-coordinates.
    """
    merged = []
    for higher in l:
        if not merged:
            merged.append(higher)
        else:
            lower = merged[-1]
            if mtol >= 0:
                if (higher[0] <= lower[1] or
                        np.isclose(higher[0], lower[1], atol=mtol)):
                    upper_bound = max(lower[1], higher[1])
                    lower_bound = min(lower[0], higher[0])
                    merged[-1] = (lower_bound, upper_bound)
                else:
                    merged.append(higher)
            elif mtol < 0:
                if higher[0] <= lower[1]:
                    if np.isclose(higher[0], lower[1], atol=abs(mtol)):
                        merged.append(higher)
                    else:
                        upper_bound = max(lower[1], higher[1])
                        lower_bound = min(lower[0], higher[0])
                        merged[-1] = (lower_bound, upper_bound)
                else:
                    merged.append(higher)
    return merged


def _get_column_index(t, columns):
    """Gets index of the column in which the given object falls by
    comparing their co-ordinates.

    Parameters
    ----------
    t : object

    columns : list

    Returns
    -------
    c : int
    """
    offset1, offset2 = 0, 0
    lt_col_overlap = []
    for c in columns:
        if c[0] <= t.x1 and c[1] >= t.x0:
            left = t.x0 if c[0] <= t.x0 else c[0]
            right = t.x1 if c[1] >= t.x1 else c[1]
            lt_col_overlap.append(abs(left - right) / abs(c[0] - c[1]))
        else:
            lt_col_overlap.append(-1)
    if len(filter(lambda x: x != -1, lt_col_overlap)) == 0:
        logging.warning("Text doesn't fit any column.")
    c_idx = lt_col_overlap.index(max(lt_col_overlap))
    if t.x0 < columns[c_idx][0]:
        offset1 = abs(t.x0 - columns[c_idx][0])
    if t.x1 > columns[c_idx][1]:
        offset2 = abs(t.x1 - columns[c_idx][1])
    Y = abs(t.y0 - t.y1)
    charea = abs(t.x0 - t.x1) * abs(t.y0 - t.y1)
    error = (Y * (offset1 + offset2)) / charea
    return c_idx, error


def _join_rows(rows_grouped, text_y_max, text_y_min):
    row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r)
                if len(r) > 0 else 0 for r in rows_grouped]
    rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
    rows.insert(0, text_y_max)
    rows.append(text_y_min)
    rows = [(rows[i], rows[i + 1])
            for i in range(0, len(rows) - 1)]
    return rows


def _add_columns(cols, text, ytolerance):
    if text:
        text = _group_rows(text, ytol=ytolerance)
        elements = [len(r) for r in text]
        new_cols = [(t.x0, t.x1)
            for r in text if len(r) == max(elements) for t in r]
        cols.extend(_merge_columns(sorted(new_cols)))
    return cols


def _join_columns(cols, text_x_min, text_x_max):
    cols = sorted(cols)
    cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
    cols.insert(0, text_x_min)
    cols.append(text_x_max)
    cols = [(cols[i], cols[i + 1])
            for i in range(0, len(cols) - 1)]
    return cols


class Stream:
    """Stream algorithm

    Groups text objects into rows and guesses number of columns
    using mode of the number of text objects in each row.

    The number of columns can be passed explicitly or specified by a
    list of column x-coordinates.

    Parameters
    ----------
    pdfobject : camelot.pdf.Pdf

    ncolumns : int
        Number of columns. (optional, default: 0)

    columns : string
        Comma-separated list of column x-coordinates.
        (optional, default: None)

    ytol : int
        Tolerance to account for when grouping rows
        together. (optional, default: 2)

    debug : bool
        Debug by visualizing textboxes. (optional, default: False)

    Attributes
    ----------
    tables : dict
        Dictionary with page number as key and list of tables on that
        page as value.
    """
    def __init__(self, table_area=None, columns=None, ncolumns=None, ytol=[2],
                 mtol=[2], margins=(2.0, 0.5, 0.1), debug=False):

        self.method = 'stream'
        self.table_area = table_area
        self.columns = columns
        self.ncolumns = ncolumns
        self.ytol = ytol
        self.mtol = mtol
        self.char_margin, self.line_margin, self.word_margin = margins
        self.debug = debug

    def get_tables(self, pdfname):
        """Returns all tables found in given pdf.

        Returns
        -------
        tables : dict
            Dictionary with page number as key and list of tables on that
            page as value.
        """
        __, text, width, height = pdf_to_text(pdfname, self.char_margin,
            self.line_margin, self.word_margin)
        bname, __ = os.path.splitext(pdfname)
        if not text:
            logging.warning("{0}: PDF has no text. It may be an image.".format(
                os.path.basename(bname)))
            return None

        if self.debug:
            self.debug_text = [(t.x0, t.y0, t.x1, t.y1) for t in text]
            return None

        if self.table_area is not None:
            if self.columns is not None:
                if len(self.table_area) != len(self.columns):
                    raise ValueError("message")
            if self.ncolumns is not None:
                if len(self.table_area) != len(self.ncolumns):
                    raise ValueError("message")
            table_bbox = {}
            for area in self.table_area:
                x1, y1, x2, y2 = area.split(",")
                x1 = int(x1)
                y1 = int(y1)
                x2 = int(x2)
                y2 = int(y2)
                table_bbox[(x1, y2, x2, y1)] = None
        else:
            table_bbox = {(0, 0, width, height): None}

        if len(self.ytol) == 1 and self.ytol[0] == 2:
            self.ytol = self.ytol * len(table_bbox)
        if len(self.mtol) == 1 and self.mtol[0] == 2:
            self.mtol = self.mtol * len(table_bbox)

        page = {}
        tables = {}
        table_no = 0
        # sort tables based on y-coord
        for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
            # select elements which lie within table_bbox
            table_data = {}
            t_bbox = text_bbox(k, text)
            t_bbox.sort(key=lambda x: (-x.y0, x.x0))

            rows_grouped = _group_rows(t_bbox, ytol=self.ytol[table_no])
            rows = _join_rows(rows_grouped, k[3], k[1])
            elements = [len(r) for r in rows_grouped]

            guess = False
            if self.columns is not None and self.columns[table_no] != "":
                # user has to input boundary columns too
                # take (0, width) by default
                # similar to else condition
                # len can't be 1
                cols = self.columns[table_no].split(',')
                cols = [(float(cols[i]), float(cols[i + 1]))
                        for i in range(0, len(cols) - 1)]
            else:
                if self.ncolumns is not None and self.ncolumns[table_no] != -1:
                    ncols = self.ncolumns[table_no]
                    cols = [(t.x0, t.x1)
                        for r in rows_grouped if len(r) == ncols for t in r]
                    cols = _merge_columns(sorted(cols), mtol=self.mtol[table_no])
                    if len(cols) != self.ncolumns[table_no]:
                        logging.warning("{}: The number of columns after merge"
                                      " isn't the same as what you specified."
                                      " Change the value of mtol.".format(
                                      os.path.basename(bname)))
                    cols = _join_columns(cols, k[0], k[2])
                else:
                    guess = True
                    ncols = max(set(elements), key=elements.count)
                    len_non_mode = len(filter(lambda x: x != ncols, elements))
                    if ncols == 1 and not self.debug:
                        # no tables detected
                        logging.warning("{}: Only one column was detected, the PDF"
                                      " may have no tables. Specify ncols if"
                                      " the PDF has tables.".format(
                                      os.path.basename(bname)))
                    cols = [(t.x0, t.x1)
                        for r in rows_grouped if len(r) == ncols for t in r]
                    cols = _merge_columns(sorted(cols), mtol=self.mtol[table_no])
                    inner_text = []
                    for i in range(1, len(cols)):
                        left = cols[i - 1][1]
                        right = cols[i][0]
                        inner_text.extend([t for t in t_bbox if t.x0 > left and t.x1 < right])
                    outer_text = [t for t in t_bbox if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
                    inner_text.extend(outer_text)
                    cols = _add_columns(cols, inner_text, self.ytol[table_no])
                    cols = _join_columns(cols, k[0], k[2])

            table = Table(cols, rows)
            rerror = []
            cerror = []
            for row in rows_grouped:
                for t in row:
                    try:
                        r_idx, rass_error = get_row_index(t, rows)
                    except ValueError as e:
                        # couldn't assign LTTextLH to any cell
                        continue
                    try:
                        c_idx, cass_error = _get_column_index(t, cols)
                    except ValueError as e:
                        # couldn't assign LTTextLH to any cell
                        continue
                    rerror.append(rass_error)
                    cerror.append(cass_error)
                    table.cells[r_idx][c_idx].add_text(
                        t.get_text().strip('\n'))
            if guess:
                score = get_score([[33, rerror], [33, cerror], [34, [len_non_mode / len(elements)]]])
            else:
                score = get_score([[50, rerror], [50, cerror]])

            table_data['score'] = score
            ar = encode_list(table.get_list())
            table_data['data'] = ar
            empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
            table_data['empty_p'] = empty_p
            table_data['r_nempty_cells'] = r_nempty_cells
            table_data['c_nempty_cells'] = c_nempty_cells
            table_data['nrows'] = len(ar)
            table_data['ncols'] = len(ar[0])
            tables['table-{0}'.format(table_no + 1)] = table_data
            table_no += 1
        page[os.path.basename(bname)] = tables

        return page