From 9124e3374c5d0aec61b53a7c7725d3839effacdd Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Wed, 5 Sep 2018 18:20:46 +0530 Subject: [PATCH] Add properties to Table --- camelot/core.py | 70 +++++++- camelot/handlers.py | 3 +- camelot/parsers.py | 405 +++++--------------------------------------- camelot/utils.py | 4 +- 4 files changed, 110 insertions(+), 372 deletions(-) diff --git a/camelot/core.py b/camelot/core.py index ec74eb3..9089593 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -1,5 +1,4 @@ import numpy as np -import pandas as pd class Cell(object): @@ -48,9 +47,15 @@ class Table(object): self.rows = rows self.cells = [[Cell(c[0], r[1], c[1], r[0]) for c in cols] for r in rows] + self._df = None + self._shape = (0, 0) + self._accuracy = 0 + self._whitespace = 0 + self._order = None + self._page = None def __repr__(self): - pass + return '<{} shape={}>'.format(self.__class__.__name__, self._shape) def set_all_edges(self): for r in range(len(self.rows)): @@ -216,12 +221,61 @@ class Table(object): return self - def get_list(self): - ar = [] + @property + def data(self): + d = [] for r in range(len(self.rows)): - ar.append([self.cells[r][c].get_text().strip() + d.append([self.cells[r][c].get_text().strip() for c in range(len(self.cols))]) - return ar + return d + + @property + def df(self): + return self._df + + @df.setter + def df(self, dataframe): + self._df = dataframe + + @property + def shape(self): + return self._shape + + @shape.setter + def shape(self, s): + self._shape = s + + @property + def accuracy(self): + return self._accuracy + + @accuracy.setter + def accuracy(self, a): + self._accuracy = a + + @property + def whitespace(self): + return self._whitespace + + @whitespace.setter + def whitespace(self, w): + self._whitespace = w + + @property + def order(self): + return self._order + + @order.setter + def order(self, o): + self._order = o + + @property + def page(self): + return self._page + + @page.setter + def page(self, p): + self._page = p class TableList(list): @@ -236,8 +290,8 @@ class TableList(list): class Geometry(object): def __init__(self): self._text = [] - self._images = [] - self._segments = [] + self._images = () + self._segments = () self._tables = [] @property diff --git a/camelot/handlers.py b/camelot/handlers.py index 35f14f4..dce3b05 100644 --- a/camelot/handlers.py +++ b/camelot/handlers.py @@ -3,6 +3,7 @@ import tempfile from PyPDF2 import PdfFileReader, PdfFileWriter +from .core import TableList, GeometryList from .parsers import Stream, Lattice from .utils import get_page_layout, get_text_objects, get_rotation @@ -80,5 +81,5 @@ class PDFHandler(object): for p in pages: t, g = parser.extract_tables(p) tables.extend(t) - geometry.extend(g) + geometry.append(g) return TableList(tables), GeometryList(geometry) \ No newline at end of file diff --git a/camelot/parsers.py b/camelot/parsers.py index 205743c..20820da 100644 --- a/camelot/parsers.py +++ b/camelot/parsers.py @@ -9,13 +9,14 @@ import warnings import subprocess import numpy as np +import pandas as pd from .core import Table, Geometry from .image_processing import (adaptive_threshold, find_lines, find_table_contours, find_table_joints) from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_in_bbox, - merge_close_values, get_table_index, get_score, count_empty, - encode_list, get_text_objects, get_page_layout) + merge_close_values, get_table_index, compute_accuracy, count_empty, + get_text_objects, get_page_layout, encode_) __all__ = ['Stream', 'Lattice'] @@ -31,58 +32,6 @@ copy_reg.pickle(types.MethodType, _reduce_method) class Stream: - """Stream looks for spaces between text elements to form a table. - - If you want to give columns, ytol or mtol for each table - when specifying multiple table areas, make sure that their length - is equal to the length of table_area. Mapping between them is based - on index. - - If you don't want to specify columns for the some tables in a pdf - page having multiple tables, pass them as empty strings. - For example: ['', 'x1,x2,x3,x4', ''] - - Parameters - ---------- - table_area : list - List of strings of the form x1,y1,x2,y2 where - (x1, y1) -> left-top and (x2, y2) -> right-bottom in PDFMiner's - coordinate space, denoting table areas to analyze. - (optional, default: None) - - columns : list - List of strings where each string is comma-separated values of - x-coordinates in PDFMiner's coordinate space. - (optional, default: None) - - ytol : list - List of ints specifying the y-tolerance parameters. - (optional, default: [2]) - - mtol : list - List of ints specifying the m-tolerance parameters. - (optional, default: [0]) - - margins : tuple - PDFMiner margins. (char_margin, line_margin, word_margin) - (optional, default: (1.0, 0.5, 0.1)) - - split_text : bool - Whether or not to split a text line if it spans across - different cells. - (optional, default: False) - - flag_size : bool - Whether or not to highlight a substring using - if its size is different from rest of the string, useful for - super and subscripts. - (optional, default: True) - - debug : bool - Set to True to generate a matplotlib plot of - LTTextLineHorizontals in order to select table_area, columns. - (optional, default: False) - """ def __init__(self, table_area=None, columns=None, ytol=[2], mtol=[0], margins=(1.0, 0.5, 0.1), split_text=False, flag_size=True, debug=False): @@ -99,20 +48,6 @@ class Stream: @staticmethod def _text_bbox(t_bbox): - """Returns bounding box for the text present on a page. - - Parameters - ---------- - t_bbox : dict - Dict with two keys 'horizontal' and 'vertical' with lists of - LTTextLineHorizontals and LTTextLineVerticals respectively. - - Returns - ------- - text_bbox : tuple - Tuple of the form (x0, y0, x1, y1) in PDFMiner's coordinate - space. - """ xmin = min([t.x0 for direction in t_bbox for t in t_bbox[direction]]) ymin = min([t.y0 for direction in t_bbox for t in t_bbox[direction]]) xmax = max([t.x1 for direction in t_bbox for t in t_bbox[direction]]) @@ -122,23 +57,6 @@ class Stream: @staticmethod def _group_rows(text, ytol=2): - """Groups PDFMiner text objects into rows using their - y-coordinates taking into account some tolerance ytol. - - Parameters - ---------- - text : list - List of PDFMiner text objects. - - ytol : int - Tolerance parameter. - (optional, default: 2) - - Returns - ------- - rows : list - Two-dimensional list of text objects grouped into rows. - """ row_y = 0 rows = [] temp = [] @@ -158,23 +76,6 @@ class Stream: @staticmethod def _merge_columns(l, mtol=0): - """Merges column boundaries if they overlap or lie within some - tolerance mtol. - - Parameters - ---------- - l : list - List of column coordinate tuples. - - mtol : int - TODO - (optional, default: 0) - - Returns - ------- - merged : list - List of merged column coordinate tuples. - """ merged = [] for higher in l: if not merged: @@ -203,22 +104,6 @@ class Stream: @staticmethod def _join_rows(rows_grouped, text_y_max, text_y_min): - """Makes row coordinates continuous. - - Parameters - ---------- - rows_grouped : list - Two-dimensional list of text objects grouped into rows. - - text_y_max : int - - text_y_min : int - - Returns - ------- - rows : list - List of continuous row coordinate tuples. - """ row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r) if len(r) > 0 else 0 for r in rows_grouped] rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))] @@ -230,25 +115,6 @@ class Stream: @staticmethod def _add_columns(cols, text, ytol): - """Adds columns to existing list by taking into account - the text that lies outside the current column coordinates. - - Parameters - ---------- - cols : list - List of column coordinate tuples. - - text : list - List of PDFMiner text objects. - - ytol : int - Tolerance parameter. - - Returns - ------- - cols : list - Updated list of column coordinate tuples. - """ if text: text = Stream._group_rows(text, ytol=ytol) elements = [len(r) for r in text] @@ -259,22 +125,6 @@ class Stream: @staticmethod def _join_columns(cols, text_x_min, text_x_max): - """Makes column coordinates continuous. - - Parameters - ---------- - cols : list - List of column coordinate tuples. - - text_x_min : int - - text_y_max : int - - Returns - ------- - cols : list - Updated list of column coordinate tuples. - """ cols = sorted(cols) cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))] cols.insert(0, text_x_min) @@ -284,17 +134,6 @@ class Stream: return cols def extract_tables(self, pdfname): - """Expects a single page pdf as input with rotation corrected. - - Parameters - --------- - pdfname : string - Path to single page pdf file. - - Returns - ------- - page : dict - """ layout, dim = get_page_layout(pdfname, char_margin=self.char_margin, line_margin=self.line_margin, word_margin=self.word_margin) lttextlh = get_text_objects(layout, ltype="lh") @@ -314,7 +153,6 @@ class Stream: text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlh]) text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlv]) g.text = text - return [None], [g] if self.table_area is not None: if self.columns is not None: @@ -343,17 +181,13 @@ class Stream: else: mtolerance = copy.deepcopy(self.mtol) - page = {} - tables = {} + _tables = [] # sort tables based on y-coord for table_no, k in enumerate(sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True)): # select elements which lie within table_bbox - table_data = {} t_bbox = {} t_bbox['horizontal'] = text_in_bbox(k, lttextlh) t_bbox['vertical'] = text_in_bbox(k, lttextlv) - char_bbox = text_in_bbox(k, ltchar) - table_data['text_p'] = 100 * (1 - (len(char_bbox) / len(ltchar))) for direction in t_bbox: t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0)) text_x_min, text_y_min, text_x_max, text_y_max = self._text_bbox(t_bbox) @@ -399,124 +233,38 @@ class Stream: table = Table(cols, rows) table = table.set_all_edges() - assignment_errors = [] - table_data['split_text'] = [] - table_data['superscript'] = [] + pos_errors = [] for direction in t_bbox: for t in t_bbox[direction]: indices, error = get_table_index( table, t, direction, split_text=self.split_text, flag_size=self.flag_size) - assignment_errors.append(error) - if len(indices) > 1: - table_data['split_text'].append(indices) - for r_idx, c_idx, text in indices: - if all(s in text for s in ['', '']): - table_data['superscript'].append((r_idx, c_idx, text)) - table.cells[r_idx][c_idx].add_text(text) + if indices[:2] != (-1, -1): + pos_errors.append(error) + for r_idx, c_idx, text in indices: + table.cells[r_idx][c_idx].add_text(text) if guess: - score = get_score([[66, assignment_errors], [34, [len_non_mode / len(elements)]]]) + accuracy = compute_accuracy([[66, pos_errors], [34, [len_non_mode / len(elements)]]]) else: - score = get_score([[100, assignment_errors]]) + accuracy = compute_accuracy([[100, pos_errors]]) - table_data['score'] = score - ar = table.get_list() - ar = encode_list(ar) - table_data['data'] = ar - empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar) - table_data['empty_p'] = empty_p - table_data['r_nempty_cells'] = r_nempty_cells - table_data['c_nempty_cells'] = c_nempty_cells - table_data['nrows'] = len(ar) - table_data['ncols'] = len(ar[0]) - tables['table-{0}'.format(table_no + 1)] = table_data - page[os.path.basename(bname)] = tables + data = table.data + data = encode_(data) + table.df = pd.DataFrame(data) + table.shape = table.df.shape - return page + whitespace, __, __ = count_empty(data) + table.accuracy = accuracy + table.whitespace = whitespace + table.order = table_no + 1 + table.page = os.path.basename(bname).replace('page-', '') + + _tables.append(table) + + return _tables, g class Lattice: - """Lattice looks for lines in the pdf to form a table. - - If you want to give fill and mtol for each table when specifying - multiple table areas, make sure that the length of fill and mtol - is equal to the length of table_area. Mapping between them is based - on index. - - Parameters - ---------- - table_area : list - List of strings of the form x1,y1,x2,y2 where - (x1, y1) -> left-top and (x2, y2) -> right-bottom in PDFMiner's - coordinate space, denoting table areas to analyze. - (optional, default: None) - - fill : list - List of strings specifying directions to fill spanning cells. - {'h', 'v'} to fill spanning cells in horizontal or vertical - direction. - (optional, default: None) - - mtol : list - List of ints specifying m-tolerance parameters. - (optional, default: [2]) - - jtol : list - List of ints specifying j-tolerance parameters. - (optional, default: [2]) - - blocksize : int - Size of a pixel neighborhood that is used to calculate a - threshold value for the pixel: 3, 5, 7, and so on. - (optional, default: 15) - - threshold_constant : float - Constant subtracted from the mean or weighted mean - (see the details below). Normally, it is positive but may be - zero or negative as well. - (optional, default: -2) - - scale : int - Used to divide the height/width of a pdf to get a structuring - element for image processing. - (optional, default: 15) - - iterations : int - Number of iterations for dilation. - (optional, default: 0) - - invert : bool - Whether or not to invert the image. Useful when pdfs have - tables with lines in background. - (optional, default: False) - - margins : tuple - PDFMiner margins. (char_margin, line_margin, word_margin) - (optional, default: (1.0, 0.5, 0.1)) - - split_text : bool - Whether or not to split a text line if it spans across - different cells. - (optional, default: False) - - flag_size : bool - Whether or not to highlight a substring using - if its size is different from rest of the string, useful for - super and subscripts. - (optional, default: True) - - shift_text : list - {'l', 'r', 't', 'b'} - Select one or more from above and pass them as a list to - specify where the text in a spanning cell should flow. - (optional, default: ['l', 't']) - - debug : string - {'contour', 'line', 'joint', 'table'} - Set to one of the above values to generate a matplotlib plot - of detected contours, lines, joints and the table generated. - (optional, default: None) - """ def __init__(self, table_area=None, fill=None, mtol=[2], jtol=[2], blocksize=15, threshold_constant=-2, scale=15, iterations=0, invert=False, margins=(1.0, 0.5, 0.1), split_text=False, @@ -540,28 +288,6 @@ class Lattice: @staticmethod def _reduce_index(t, idx, shift_text): - """Reduces index of a text object if it lies within a spanning - cell. - - Parameters - ---------- - table : object - camelot.table.Table - - idx : list - List of tuples of the form (r_idx, c_idx, text). - - shift_text : list - {'l', 'r', 't', 'b'} - Select one or more from above and pass them as a list to - specify where the text in a spanning cell should flow. - - Returns - ------- - indices : list - List of tuples of the form (idx, text) where idx is the reduced - index of row/column and text is the an lttextline substring. - """ indices = [] for r_idx, c_idx, text in idx: for d in shift_text: @@ -586,24 +312,6 @@ class Lattice: def _fill_spanning(t, fill=None): - """Fills spanning cells. - - Parameters - ---------- - t : object - camelot.table.Table - - fill : list - {'h', 'v'} - Specify to fill spanning cells in horizontal or vertical - direction. - (optional, default: None) - - Returns - ------- - t : object - camelot.table.Table - """ for f in fill: if f == "h": for i in range(len(t.cells)): @@ -620,17 +328,6 @@ class Lattice: return t def extract_tables(self, pdfname): - """Expects a single page pdf as input with rotation corrected. - - Parameters - ---------- - pdfname : string - Path to single page pdf file. - - Returns - ------- - page : dict - """ layout, dim = get_page_layout(pdfname, char_margin=self.char_margin, line_margin=self.line_margin, word_margin=self.word_margin) lttextlh = get_text_objects(layout, ltype="lh") @@ -700,27 +397,22 @@ class Lattice: g = Geometry() if self.debug: - g.images = [(img, table_bbox)] + g.images = (img, table_bbox) table_bbox, v_segments, h_segments = scale_to_pdf(table_bbox, v_segments, h_segments, factors_pdf) if self.debug: - g.segments = [(v_segments, h_segments)] - _tables = [] + g.segments = (v_segments, h_segments) - page = {} - tables = {} + _tables = [] # sort tables based on y-coord for table_no, k in enumerate(sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True)): # select elements which lie within table_bbox - table_data = {} t_bbox = {} v_s, h_s = segments_bbox(k, v_segments, h_segments) t_bbox['horizontal'] = text_in_bbox(k, lttextlh) t_bbox['vertical'] = text_in_bbox(k, lttextlv) - char_bbox = text_in_bbox(k, ltchar) - table_data['text_p'] = 100 * (1 - (len(char_bbox) / len(ltchar))) for direction in t_bbox: t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0)) cols, rows = zip(*table_bbox[k]) @@ -745,45 +437,36 @@ class Lattice: # set table border edges to True table = table.set_border_edges() - if self.debug: - _tables.append(table) - - assignment_errors = [] - table_data['split_text'] = [] - table_data['superscript'] = [] + pos_errors = [] for direction in ['vertical', 'horizontal']: for t in t_bbox[direction]: indices, error = get_table_index( table, t, direction, split_text=self.split_text, flag_size=self.flag_size) if indices[:2] != (-1, -1): - assignment_errors.append(error) + pos_errors.append(error) indices = self._reduce_index(table, indices, shift_text=self.shift_text) - if len(indices) > 1: - table_data['split_text'].append(indices) for r_idx, c_idx, text in indices: - if all(s in text for s in ['', '']): - table_data['superscript'].append((r_idx, c_idx, text)) table.cells[r_idx][c_idx].add_text(text) - score = get_score([[100, assignment_errors]]) - table_data['score'] = score + accuracy = compute_accuracy([[100, pos_errors]]) if self.fill is not None: table = self._fill_spanning(table, fill=self.fill) - ar = table.get_list() - ar = encode_list(ar) - table_data['data'] = ar - empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar) - table_data['empty_p'] = empty_p - table_data['r_nempty_cells'] = r_nempty_cells - table_data['c_nempty_cells'] = c_nempty_cells - table_data['nrows'] = len(ar) - table_data['ncols'] = len(ar[0]) - tables['table-{0}'.format(table_no + 1)] = table_data - page[os.path.basename(bname)] = tables + + data = table.data + data = encode_(data) + table.df = pd.DataFrame(data) + table.shape = table.df.shape + + whitespace, __, __ = count_empty(data) + table.accuracy = accuracy + table.whitespace = whitespace + table.order = table_no + 1 + table.page = os.path.basename(bname).replace('page-', '') + + _tables.append(table) if self.debug: g.tables = _tables - return [None], [g] - return page \ No newline at end of file + return _tables, g \ No newline at end of file diff --git a/camelot/utils.py b/camelot/utils.py index 3640b37..650e62a 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -557,7 +557,7 @@ def get_table_index(table, t, direction, split_text=False, flag_size=True): return [(r_idx, c_idx, t.get_text().strip('\n'))], error -def get_score(error_weights): +def compute_accuracy(error_weights): """Calculates score based on weights assigned to various parameters, and their error percentages. @@ -648,7 +648,7 @@ def count_empty(d): return empty_p, r_nempty_cells, c_nempty_cells -def encode_list(ar): +def encode_(ar): """Encodes list of text. Parameters