Add properties to Table

pull/2/head
Vinayak Mehta 2018-09-05 18:20:46 +05:30
parent b9d77cb983
commit 9124e3374c
4 changed files with 110 additions and 372 deletions

View File

@ -1,5 +1,4 @@
import numpy as np
import pandas as pd
class Cell(object):
@ -48,9 +47,15 @@ class Table(object):
self.rows = rows
self.cells = [[Cell(c[0], r[1], c[1], r[0])
for c in cols] for r in rows]
self._df = None
self._shape = (0, 0)
self._accuracy = 0
self._whitespace = 0
self._order = None
self._page = None
def __repr__(self):
pass
return '<{} shape={}>'.format(self.__class__.__name__, self._shape)
def set_all_edges(self):
for r in range(len(self.rows)):
@ -216,12 +221,61 @@ class Table(object):
return self
def get_list(self):
ar = []
@property
def data(self):
d = []
for r in range(len(self.rows)):
ar.append([self.cells[r][c].get_text().strip()
d.append([self.cells[r][c].get_text().strip()
for c in range(len(self.cols))])
return ar
return d
# Read accessor: returns whatever is currently stored in self._df.
# In this diff the parsers assign pd.DataFrame(data) to it after extraction.
@property
def df(self):
return self._df
# Write accessor: stores the given object in self._df as-is (no validation, no copy).
@df.setter
def df(self, dataframe):
self._df = dataframe
# Read accessor: returns the value stored in self._shape.
# In this diff the parsers set it from table.df.shape; __repr__ also reads self._shape.
@property
def shape(self):
return self._shape
# Write accessor: stores the given value in self._shape as-is (no validation).
@shape.setter
def shape(self, s):
self._shape = s
# Read accessor: returns the value stored in self._accuracy.
# In this diff the parsers set it to the result of compute_accuracy(...).
@property
def accuracy(self):
return self._accuracy
# Write accessor: stores the given value in self._accuracy as-is (no validation).
@accuracy.setter
def accuracy(self, a):
self._accuracy = a
# Read accessor: returns the value stored in self._whitespace.
# In this diff the parsers set it to the first value returned by count_empty(data).
@property
def whitespace(self):
return self._whitespace
# Write accessor: stores the given value in self._whitespace as-is (no validation).
@whitespace.setter
def whitespace(self, w):
self._whitespace = w
# Read accessor: returns the value stored in self._order.
# In this diff the parsers set it to table_no + 1, i.e. a 1-based position
# in the y-sorted enumeration of table bounding boxes.
@property
def order(self):
return self._order
# Write accessor: stores the given value in self._order as-is (no validation).
@order.setter
def order(self, o):
self._order = o
# Read accessor: returns the value stored in self._page.
# In this diff the parsers set it to os.path.basename(bname) with 'page-' removed,
# so the stored value is a string rather than an int.
@property
def page(self):
return self._page
# Write accessor: stores the given value in self._page as-is (no validation).
@page.setter
def page(self, p):
self._page = p
class TableList(list):
@ -236,8 +290,8 @@ class TableList(list):
class Geometry(object):
def __init__(self):
self._text = []
self._images = []
self._segments = []
self._images = ()
self._segments = ()
self._tables = []
@property

View File

@ -3,6 +3,7 @@ import tempfile
from PyPDF2 import PdfFileReader, PdfFileWriter
from .core import TableList, GeometryList
from .parsers import Stream, Lattice
from .utils import get_page_layout, get_text_objects, get_rotation
@ -80,5 +81,5 @@ class PDFHandler(object):
for p in pages:
t, g = parser.extract_tables(p)
tables.extend(t)
geometry.extend(g)
geometry.append(g)
return TableList(tables), GeometryList(geometry)

View File

@ -9,13 +9,14 @@ import warnings
import subprocess
import numpy as np
import pandas as pd
from .core import Table, Geometry
from .image_processing import (adaptive_threshold, find_lines, find_table_contours,
find_table_joints)
from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_in_bbox,
merge_close_values, get_table_index, get_score, count_empty,
encode_list, get_text_objects, get_page_layout)
merge_close_values, get_table_index, compute_accuracy, count_empty,
get_text_objects, get_page_layout, encode_)
__all__ = ['Stream', 'Lattice']
@ -31,58 +32,6 @@ copy_reg.pickle(types.MethodType, _reduce_method)
class Stream:
"""Stream looks for spaces between text elements to form a table.
If you want to give columns, ytol or mtol for each table
when specifying multiple table areas, make sure that their length
is equal to the length of table_area. Mapping between them is based
on index.
If you don't want to specify columns for some of the tables in a pdf
page having multiple tables, pass them as empty strings.
For example: ['', 'x1,x2,x3,x4', '']
Parameters
----------
table_area : list
List of strings of the form x1,y1,x2,y2 where
(x1, y1) -> left-top and (x2, y2) -> right-bottom in PDFMiner's
coordinate space, denoting table areas to analyze.
(optional, default: None)
columns : list
List of strings where each string is comma-separated values of
x-coordinates in PDFMiner's coordinate space.
(optional, default: None)
ytol : list
List of ints specifying the y-tolerance parameters.
(optional, default: [2])
mtol : list
List of ints specifying the m-tolerance parameters.
(optional, default: [0])
margins : tuple
PDFMiner margins. (char_margin, line_margin, word_margin)
(optional, default: (1.0, 0.5, 0.1))
split_text : bool
Whether or not to split a text line if it spans across
different cells.
(optional, default: False)
flag_size : bool
Whether or not to highlight a substring using <s></s>
if its size is different from rest of the string, useful for
super and subscripts.
(optional, default: True)
debug : bool
Set to True to generate a matplotlib plot of
LTTextLineHorizontals in order to select table_area, columns.
(optional, default: False)
"""
def __init__(self, table_area=None, columns=None, ytol=[2], mtol=[0],
margins=(1.0, 0.5, 0.1), split_text=False, flag_size=True,
debug=False):
@ -99,20 +48,6 @@ class Stream:
@staticmethod
def _text_bbox(t_bbox):
"""Returns bounding box for the text present on a page.
Parameters
----------
t_bbox : dict
Dict with two keys 'horizontal' and 'vertical' with lists of
LTTextLineHorizontals and LTTextLineVerticals respectively.
Returns
-------
text_bbox : tuple
Tuple of the form (x0, y0, x1, y1) in PDFMiner's coordinate
space.
"""
xmin = min([t.x0 for direction in t_bbox for t in t_bbox[direction]])
ymin = min([t.y0 for direction in t_bbox for t in t_bbox[direction]])
xmax = max([t.x1 for direction in t_bbox for t in t_bbox[direction]])
@ -122,23 +57,6 @@ class Stream:
@staticmethod
def _group_rows(text, ytol=2):
"""Groups PDFMiner text objects into rows using their
y-coordinates taking into account some tolerance ytol.
Parameters
----------
text : list
List of PDFMiner text objects.
ytol : int
Tolerance parameter.
(optional, default: 2)
Returns
-------
rows : list
Two-dimensional list of text objects grouped into rows.
"""
row_y = 0
rows = []
temp = []
@ -158,23 +76,6 @@ class Stream:
@staticmethod
def _merge_columns(l, mtol=0):
"""Merges column boundaries if they overlap or lie within some
tolerance mtol.
Parameters
----------
l : list
List of column coordinate tuples.
mtol : int
Tolerance within which column boundaries are merged.
(optional, default: 0)
Returns
-------
merged : list
List of merged column coordinate tuples.
"""
merged = []
for higher in l:
if not merged:
@ -203,22 +104,6 @@ class Stream:
@staticmethod
def _join_rows(rows_grouped, text_y_max, text_y_min):
"""Makes row coordinates continuous.
Parameters
----------
rows_grouped : list
Two-dimensional list of text objects grouped into rows.
text_y_max : int
text_y_min : int
Returns
-------
rows : list
List of continuous row coordinate tuples.
"""
row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r)
if len(r) > 0 else 0 for r in rows_grouped]
rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
@ -230,25 +115,6 @@ class Stream:
@staticmethod
def _add_columns(cols, text, ytol):
"""Adds columns to existing list by taking into account
the text that lies outside the current column coordinates.
Parameters
----------
cols : list
List of column coordinate tuples.
text : list
List of PDFMiner text objects.
ytol : int
Tolerance parameter.
Returns
-------
cols : list
Updated list of column coordinate tuples.
"""
if text:
text = Stream._group_rows(text, ytol=ytol)
elements = [len(r) for r in text]
@ -259,22 +125,6 @@ class Stream:
@staticmethod
def _join_columns(cols, text_x_min, text_x_max):
"""Makes column coordinates continuous.
Parameters
----------
cols : list
List of column coordinate tuples.
text_x_min : int
text_x_max : int
Returns
-------
cols : list
Updated list of column coordinate tuples.
"""
cols = sorted(cols)
cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
cols.insert(0, text_x_min)
@ -284,17 +134,6 @@ class Stream:
return cols
def extract_tables(self, pdfname):
"""Expects a single page pdf as input with rotation corrected.
Parameters
----------
pdfname : string
Path to single page pdf file.
Returns
-------
page : dict
"""
layout, dim = get_page_layout(pdfname, char_margin=self.char_margin,
line_margin=self.line_margin, word_margin=self.word_margin)
lttextlh = get_text_objects(layout, ltype="lh")
@ -314,7 +153,6 @@ class Stream:
text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlh])
text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlv])
g.text = text
return [None], [g]
if self.table_area is not None:
if self.columns is not None:
@ -343,17 +181,13 @@ class Stream:
else:
mtolerance = copy.deepcopy(self.mtol)
page = {}
tables = {}
_tables = []
# sort tables based on y-coord
for table_no, k in enumerate(sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True)):
# select elements which lie within table_bbox
table_data = {}
t_bbox = {}
t_bbox['horizontal'] = text_in_bbox(k, lttextlh)
t_bbox['vertical'] = text_in_bbox(k, lttextlv)
char_bbox = text_in_bbox(k, ltchar)
table_data['text_p'] = 100 * (1 - (len(char_bbox) / len(ltchar)))
for direction in t_bbox:
t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0))
text_x_min, text_y_min, text_x_max, text_y_max = self._text_bbox(t_bbox)
@ -399,124 +233,38 @@ class Stream:
table = Table(cols, rows)
table = table.set_all_edges()
assignment_errors = []
table_data['split_text'] = []
table_data['superscript'] = []
pos_errors = []
for direction in t_bbox:
for t in t_bbox[direction]:
indices, error = get_table_index(
table, t, direction, split_text=self.split_text,
flag_size=self.flag_size)
assignment_errors.append(error)
if len(indices) > 1:
table_data['split_text'].append(indices)
for r_idx, c_idx, text in indices:
if all(s in text for s in ['<s>', '</s>']):
table_data['superscript'].append((r_idx, c_idx, text))
table.cells[r_idx][c_idx].add_text(text)
if indices[:2] != (-1, -1):
pos_errors.append(error)
for r_idx, c_idx, text in indices:
table.cells[r_idx][c_idx].add_text(text)
if guess:
score = get_score([[66, assignment_errors], [34, [len_non_mode / len(elements)]]])
accuracy = compute_accuracy([[66, pos_errors], [34, [len_non_mode / len(elements)]]])
else:
score = get_score([[100, assignment_errors]])
accuracy = compute_accuracy([[100, pos_errors]])
table_data['score'] = score
ar = table.get_list()
ar = encode_list(ar)
table_data['data'] = ar
empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
table_data['empty_p'] = empty_p
table_data['r_nempty_cells'] = r_nempty_cells
table_data['c_nempty_cells'] = c_nempty_cells
table_data['nrows'] = len(ar)
table_data['ncols'] = len(ar[0])
tables['table-{0}'.format(table_no + 1)] = table_data
page[os.path.basename(bname)] = tables
data = table.data
data = encode_(data)
table.df = pd.DataFrame(data)
table.shape = table.df.shape
return page
whitespace, __, __ = count_empty(data)
table.accuracy = accuracy
table.whitespace = whitespace
table.order = table_no + 1
table.page = os.path.basename(bname).replace('page-', '')
_tables.append(table)
return _tables, g
class Lattice:
"""Lattice looks for lines in the pdf to form a table.
If you want to give fill and mtol for each table when specifying
multiple table areas, make sure that the length of fill and mtol
is equal to the length of table_area. Mapping between them is based
on index.
Parameters
----------
table_area : list
List of strings of the form x1,y1,x2,y2 where
(x1, y1) -> left-top and (x2, y2) -> right-bottom in PDFMiner's
coordinate space, denoting table areas to analyze.
(optional, default: None)
fill : list
List of strings specifying directions to fill spanning cells.
{'h', 'v'} to fill spanning cells in horizontal or vertical
direction.
(optional, default: None)
mtol : list
List of ints specifying m-tolerance parameters.
(optional, default: [2])
jtol : list
List of ints specifying j-tolerance parameters.
(optional, default: [2])
blocksize : int
Size of a pixel neighborhood that is used to calculate a
threshold value for the pixel: 3, 5, 7, and so on.
(optional, default: 15)
threshold_constant : float
Constant subtracted from the mean or weighted mean
(see the details below). Normally, it is positive but may be
zero or negative as well.
(optional, default: -2)
scale : int
Used to divide the height/width of a pdf to get a structuring
element for image processing.
(optional, default: 15)
iterations : int
Number of iterations for dilation.
(optional, default: 0)
invert : bool
Whether or not to invert the image. Useful when pdfs have
tables with lines in background.
(optional, default: False)
margins : tuple
PDFMiner margins. (char_margin, line_margin, word_margin)
(optional, default: (1.0, 0.5, 0.1))
split_text : bool
Whether or not to split a text line if it spans across
different cells.
(optional, default: False)
flag_size : bool
Whether or not to highlight a substring using <s></s>
if its size is different from rest of the string, useful for
super and subscripts.
(optional, default: True)
shift_text : list
{'l', 'r', 't', 'b'}
Select one or more from above and pass them as a list to
specify where the text in a spanning cell should flow.
(optional, default: ['l', 't'])
debug : string
{'contour', 'line', 'joint', 'table'}
Set to one of the above values to generate a matplotlib plot
of detected contours, lines, joints and the table generated.
(optional, default: None)
"""
def __init__(self, table_area=None, fill=None, mtol=[2], jtol=[2],
blocksize=15, threshold_constant=-2, scale=15, iterations=0,
invert=False, margins=(1.0, 0.5, 0.1), split_text=False,
@ -540,28 +288,6 @@ class Lattice:
@staticmethod
def _reduce_index(t, idx, shift_text):
"""Reduces index of a text object if it lies within a spanning
cell.
Parameters
----------
t : object
camelot.table.Table
idx : list
List of tuples of the form (r_idx, c_idx, text).
shift_text : list
{'l', 'r', 't', 'b'}
Select one or more from above and pass them as a list to
specify where the text in a spanning cell should flow.
Returns
-------
indices : list
List of tuples of the form (idx, text) where idx is the reduced
index of row/column and text is an lttextline substring.
"""
indices = []
for r_idx, c_idx, text in idx:
for d in shift_text:
@ -586,24 +312,6 @@ class Lattice:
def _fill_spanning(t, fill=None):
"""Fills spanning cells.
Parameters
----------
t : object
camelot.table.Table
fill : list
{'h', 'v'}
Specify to fill spanning cells in horizontal or vertical
direction.
(optional, default: None)
Returns
-------
t : object
camelot.table.Table
"""
for f in fill:
if f == "h":
for i in range(len(t.cells)):
@ -620,17 +328,6 @@ class Lattice:
return t
def extract_tables(self, pdfname):
"""Expects a single page pdf as input with rotation corrected.
Parameters
----------
pdfname : string
Path to single page pdf file.
Returns
-------
page : dict
"""
layout, dim = get_page_layout(pdfname, char_margin=self.char_margin,
line_margin=self.line_margin, word_margin=self.word_margin)
lttextlh = get_text_objects(layout, ltype="lh")
@ -700,27 +397,22 @@ class Lattice:
g = Geometry()
if self.debug:
g.images = [(img, table_bbox)]
g.images = (img, table_bbox)
table_bbox, v_segments, h_segments = scale_to_pdf(table_bbox, v_segments,
h_segments, factors_pdf)
if self.debug:
g.segments = [(v_segments, h_segments)]
_tables = []
g.segments = (v_segments, h_segments)
page = {}
tables = {}
_tables = []
# sort tables based on y-coord
for table_no, k in enumerate(sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True)):
# select elements which lie within table_bbox
table_data = {}
t_bbox = {}
v_s, h_s = segments_bbox(k, v_segments, h_segments)
t_bbox['horizontal'] = text_in_bbox(k, lttextlh)
t_bbox['vertical'] = text_in_bbox(k, lttextlv)
char_bbox = text_in_bbox(k, ltchar)
table_data['text_p'] = 100 * (1 - (len(char_bbox) / len(ltchar)))
for direction in t_bbox:
t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0))
cols, rows = zip(*table_bbox[k])
@ -745,45 +437,36 @@ class Lattice:
# set table border edges to True
table = table.set_border_edges()
if self.debug:
_tables.append(table)
assignment_errors = []
table_data['split_text'] = []
table_data['superscript'] = []
pos_errors = []
for direction in ['vertical', 'horizontal']:
for t in t_bbox[direction]:
indices, error = get_table_index(
table, t, direction, split_text=self.split_text,
flag_size=self.flag_size)
if indices[:2] != (-1, -1):
assignment_errors.append(error)
pos_errors.append(error)
indices = self._reduce_index(table, indices, shift_text=self.shift_text)
if len(indices) > 1:
table_data['split_text'].append(indices)
for r_idx, c_idx, text in indices:
if all(s in text for s in ['<s>', '</s>']):
table_data['superscript'].append((r_idx, c_idx, text))
table.cells[r_idx][c_idx].add_text(text)
score = get_score([[100, assignment_errors]])
table_data['score'] = score
accuracy = compute_accuracy([[100, pos_errors]])
if self.fill is not None:
table = self._fill_spanning(table, fill=self.fill)
ar = table.get_list()
ar = encode_list(ar)
table_data['data'] = ar
empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
table_data['empty_p'] = empty_p
table_data['r_nempty_cells'] = r_nempty_cells
table_data['c_nempty_cells'] = c_nempty_cells
table_data['nrows'] = len(ar)
table_data['ncols'] = len(ar[0])
tables['table-{0}'.format(table_no + 1)] = table_data
page[os.path.basename(bname)] = tables
data = table.data
data = encode_(data)
table.df = pd.DataFrame(data)
table.shape = table.df.shape
whitespace, __, __ = count_empty(data)
table.accuracy = accuracy
table.whitespace = whitespace
table.order = table_no + 1
table.page = os.path.basename(bname).replace('page-', '')
_tables.append(table)
if self.debug:
g.tables = _tables
return [None], [g]
return page
return _tables, g

View File

@ -557,7 +557,7 @@ def get_table_index(table, t, direction, split_text=False, flag_size=True):
return [(r_idx, c_idx, t.get_text().strip('\n'))], error
def get_score(error_weights):
def compute_accuracy(error_weights):
"""Calculates score based on weights assigned to various parameters,
and their error percentages.
@ -648,7 +648,7 @@ def count_empty(d):
return empty_p, r_nempty_cells, c_nempty_cells
def encode_list(ar):
def encode_(ar):
"""Encodes list of text.
Parameters