Merge pull request #234 from socialcopsdev/add-060-kwargs

[MRG] Add more configuration parameters
pull/2/head
Vinayak Mehta 2018-12-21 16:56:30 +05:30 committed by GitHub
commit 175ba32d38
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
16 changed files with 310 additions and 120 deletions

View File

@ -43,6 +43,8 @@ pass_config = click.make_pass_decorator(Config)
help='Split text that spans across multiple cells.') help='Split text that spans across multiple cells.')
@click.option('-flag', '--flag_size', is_flag=True, help='Flag text based on' @click.option('-flag', '--flag_size', is_flag=True, help='Flag text based on'
' font size. Useful to detect super/subscripts.') ' font size. Useful to detect super/subscripts.')
@click.option('-strip', '--strip_text', help='Characters that should be stripped from a string before'
' assigning it to a cell.')
@click.option('-M', '--margins', nargs=3, default=(1.0, 0.5, 0.1), @click.option('-M', '--margins', nargs=3, default=(1.0, 0.5, 0.1),
help='PDFMiner char_margin, line_margin and word_margin.') help='PDFMiner char_margin, line_margin and word_margin.')
@click.pass_context @click.pass_context
@ -68,10 +70,10 @@ def cli(ctx, *args, **kwargs):
@click.option('-shift', '--shift_text', default=['l', 't'], @click.option('-shift', '--shift_text', default=['l', 't'],
type=click.Choice(['', 'l', 'r', 't', 'b']), multiple=True, type=click.Choice(['', 'l', 'r', 't', 'b']), multiple=True,
help='Direction in which text in a spanning cell will flow.') help='Direction in which text in a spanning cell will flow.')
@click.option('-l', '--line_close_tol', default=2, @click.option('-l', '--line_tol', default=2,
help='Tolerance parameter used to merge close vertical' help='Tolerance parameter used to merge close vertical'
' and horizontal lines.') ' and horizontal lines.')
@click.option('-j', '--joint_close_tol', default=2, @click.option('-j', '--joint_tol', default=2,
help='Tolerance parameter used to decide whether' help='Tolerance parameter used to decide whether'
' the detected lines and points lie close to each other.') ' the detected lines and points lie close to each other.')
@click.option('-block', '--threshold_blocksize', default=15, @click.option('-block', '--threshold_blocksize', default=15,
@ -84,6 +86,8 @@ def cli(ctx, *args, **kwargs):
' may be zero or negative as well.') ' may be zero or negative as well.')
@click.option('-I', '--iterations', default=0, @click.option('-I', '--iterations', default=0,
help='Number of times for erosion/dilation will be applied.') help='Number of times for erosion/dilation will be applied.')
@click.option('-res', '--resolution', default=300,
help='Resolution used for PDF to PNG conversion.')
@click.option('-plot', '--plot_type', @click.option('-plot', '--plot_type',
type=click.Choice(['text', 'grid', 'contour', 'joint', 'line']), type=click.Choice(['text', 'grid', 'contour', 'joint', 'line']),
help='Plot elements found on PDF page for visual debugging.') help='Plot elements found on PDF page for visual debugging.')
@ -133,9 +137,11 @@ def lattice(c, *args, **kwargs):
' where x1, y1 -> left-top and x2, y2 -> right-bottom.') ' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@click.option('-C', '--columns', default=[], multiple=True, @click.option('-C', '--columns', default=[], multiple=True,
help='X coordinates of column separators.') help='X coordinates of column separators.')
@click.option('-r', '--row_close_tol', default=2, help='Tolerance parameter' @click.option('-e', '--edge_tol', default=50, help='Tolerance parameter'
' for extending textedges vertically.')
@click.option('-r', '--row_tol', default=2, help='Tolerance parameter'
' used to combine text vertically, to generate rows.') ' used to combine text vertically, to generate rows.')
@click.option('-c', '--col_close_tol', default=0, help='Tolerance parameter' @click.option('-c', '--column_tol', default=0, help='Tolerance parameter'
' used to combine text horizontally, to generate columns.') ' used to combine text horizontally, to generate columns.')
@click.option('-plot', '--plot_type', @click.option('-plot', '--plot_type',
type=click.Choice(['text', 'grid', 'contour', 'textedge']), type=click.Choice(['text', 'grid', 'contour', 'textedge']),

View File

@ -13,8 +13,6 @@ import pandas as pd
# minimum number of vertical textline intersections for a textedge # minimum number of vertical textline intersections for a textedge
# to be considered valid # to be considered valid
TEXTEDGE_REQUIRED_ELEMENTS = 4 TEXTEDGE_REQUIRED_ELEMENTS = 4
# y coordinate tolerance for extending textedge
TEXTEDGE_EXTEND_TOLERANCE = 50
# padding added to table area on the left, right and bottom # padding added to table area on the left, right and bottom
TABLE_AREA_PADDING = 10 TABLE_AREA_PADDING = 10
@ -55,11 +53,11 @@ class TextEdge(object):
return '<TextEdge x={} y0={} y1={} align={} valid={}>'.format( return '<TextEdge x={} y0={} y1={} align={} valid={}>'.format(
round(self.x, 2), round(self.y0, 2), round(self.y1, 2), self.align, self.is_valid) round(self.x, 2), round(self.y0, 2), round(self.y1, 2), self.align, self.is_valid)
def update_coords(self, x, y0): def update_coords(self, x, y0, edge_tol=50):
"""Updates the text edge's x and bottom y coordinates and sets """Updates the text edge's x and bottom y coordinates and sets
the is_valid attribute. the is_valid attribute.
""" """
if np.isclose(self.y0, y0, atol=TEXTEDGE_EXTEND_TOLERANCE): if np.isclose(self.y0, y0, atol=edge_tol):
self.x = (self.intersections * self.x + x) / float(self.intersections + 1) self.x = (self.intersections * self.x + x) / float(self.intersections + 1)
self.y0 = y0 self.y0 = y0
self.intersections += 1 self.intersections += 1
@ -74,7 +72,8 @@ class TextEdges(object):
the PDF page. The dict has three keys based on the alignments, the PDF page. The dict has three keys based on the alignments,
and each key's value is a list of camelot.core.TextEdge objects. and each key's value is a list of camelot.core.TextEdge objects.
""" """
def __init__(self): def __init__(self, edge_tol=50):
self.edge_tol = edge_tol
self._textedges = {'left': [], 'right': [], 'middle': []} self._textedges = {'left': [], 'right': [], 'middle': []}
@staticmethod @staticmethod
@ -115,7 +114,8 @@ class TextEdges(object):
if idx is None: if idx is None:
self.add(textline, align) self.add(textline, align)
else: else:
self._textedges[align][idx].update_coords(x_coord, textline.y0) self._textedges[align][idx].update_coords(
x_coord, textline.y0, edge_tol=self.edge_tol)
def generate(self, textlines): def generate(self, textlines):
"""Generates the text edges dict based on horizontal text """Generates the text edges dict based on horizontal text
@ -359,7 +359,7 @@ class Table(object):
cell.left = cell.right = cell.top = cell.bottom = True cell.left = cell.right = cell.top = cell.bottom = True
return self return self
def set_edges(self, vertical, horizontal, joint_close_tol=2): def set_edges(self, vertical, horizontal, joint_tol=2):
"""Sets a cell's edges to True depending on whether the cell's """Sets a cell's edges to True depending on whether the cell's
coordinates overlap with the line's coordinates within a coordinates overlap with the line's coordinates within a
tolerance. tolerance.
@ -376,11 +376,11 @@ class Table(object):
# find closest x coord # find closest x coord
# iterate over y coords and find closest start and end points # iterate over y coords and find closest start and end points
i = [i for i, t in enumerate(self.cols) i = [i for i, t in enumerate(self.cols)
if np.isclose(v[0], t[0], atol=joint_close_tol)] if np.isclose(v[0], t[0], atol=joint_tol)]
j = [j for j, t in enumerate(self.rows) j = [j for j, t in enumerate(self.rows)
if np.isclose(v[3], t[0], atol=joint_close_tol)] if np.isclose(v[3], t[0], atol=joint_tol)]
k = [k for k, t in enumerate(self.rows) k = [k for k, t in enumerate(self.rows)
if np.isclose(v[1], t[0], atol=joint_close_tol)] if np.isclose(v[1], t[0], atol=joint_tol)]
if not j: if not j:
continue continue
J = j[0] J = j[0]
@ -427,11 +427,11 @@ class Table(object):
# find closest y coord # find closest y coord
# iterate over x coords and find closest start and end points # iterate over x coords and find closest start and end points
i = [i for i, t in enumerate(self.rows) i = [i for i, t in enumerate(self.rows)
if np.isclose(h[1], t[0], atol=joint_close_tol)] if np.isclose(h[1], t[0], atol=joint_tol)]
j = [j for j, t in enumerate(self.cols) j = [j for j, t in enumerate(self.cols)
if np.isclose(h[0], t[0], atol=joint_close_tol)] if np.isclose(h[0], t[0], atol=joint_tol)]
k = [k for k, t in enumerate(self.cols) k = [k for k, t in enumerate(self.cols)
if np.isclose(h[2], t[0], atol=joint_close_tol)] if np.isclose(h[2], t[0], atol=joint_tol)]
if not j: if not j:
continue continue
J = j[0] J = j[0]

View File

@ -40,10 +40,13 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
flag_size : bool, optional (default: False) flag_size : bool, optional (default: False)
Flag text based on font size. Useful to detect Flag text based on font size. Useful to detect
super/subscripts. Adds <s></s> around flagged text. super/subscripts. Adds <s></s> around flagged text.
row_close_tol^ : int, optional (default: 2) strip_text : str, optional (default: '')
Characters that should be stripped from a string before
assigning it to a cell.
row_tol^ : int, optional (default: 2)
Tolerance parameter used to combine text vertically, Tolerance parameter used to combine text vertically,
to generate rows. to generate rows.
col_close_tol^ : int, optional (default: 0) column_tol^ : int, optional (default: 0)
Tolerance parameter used to combine text horizontally, Tolerance parameter used to combine text horizontally,
to generate columns. to generate columns.
process_background* : bool, optional (default: False) process_background* : bool, optional (default: False)
@ -59,10 +62,10 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
shift_text* : list, optional (default: ['l', 't']) shift_text* : list, optional (default: ['l', 't'])
{'l', 'r', 't', 'b'} {'l', 'r', 't', 'b'}
Direction in which text in a spanning cell will flow. Direction in which text in a spanning cell will flow.
line_close_tol* : int, optional (default: 2) line_tol* : int, optional (default: 2)
Tolerance parameter used to merge close vertical and horizontal Tolerance parameter used to merge close vertical and horizontal
lines. lines.
joint_close_tol* : int, optional (default: 2) joint_tol* : int, optional (default: 2)
Tolerance parameter used to decide whether the detected lines Tolerance parameter used to decide whether the detected lines
and points lie close to each other. and points lie close to each other.
threshold_blocksize* : int, optional (default: 15) threshold_blocksize* : int, optional (default: 15)
@ -79,6 +82,8 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
Number of times for erosion/dilation is applied. Number of times for erosion/dilation is applied.
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_. For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
resolution* : int, optional (default: 300)
Resolution used for PDF to PNG conversion.
Returns Returns
------- -------

View File

@ -50,10 +50,13 @@ class Lattice(BaseParser):
flag_size : bool, optional (default: False) flag_size : bool, optional (default: False)
Flag text based on font size. Useful to detect Flag text based on font size. Useful to detect
super/subscripts. Adds <s></s> around flagged text. super/subscripts. Adds <s></s> around flagged text.
line_close_tol : int, optional (default: 2) strip_text : str, optional (default: '')
Characters that should be stripped from a string before
assigning it to a cell.
line_tol : int, optional (default: 2)
Tolerance parameter used to merge close vertical and horizontal Tolerance parameter used to merge close vertical and horizontal
lines. lines.
joint_close_tol : int, optional (default: 2) joint_tol : int, optional (default: 2)
Tolerance parameter used to decide whether the detected lines Tolerance parameter used to decide whether the detected lines
and points lie close to each other. and points lie close to each other.
threshold_blocksize : int, optional (default: 15) threshold_blocksize : int, optional (default: 15)
@ -70,13 +73,15 @@ class Lattice(BaseParser):
Number of times for erosion/dilation is applied. Number of times for erosion/dilation is applied.
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_. For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
resolution : int, optional (default: 300)
Resolution used for PDF to PNG conversion.
""" """
def __init__(self, table_areas=None, process_background=False, def __init__(self, table_areas=None, process_background=False,
line_size_scaling=15, copy_text=None, shift_text=['l', 't'], line_size_scaling=15, copy_text=None, shift_text=['l', 't'],
split_text=False, flag_size=False, line_close_tol=2, split_text=False, flag_size=False, strip_text='', line_tol=2,
joint_close_tol=2, threshold_blocksize=15, threshold_constant=-2, joint_tol=2, threshold_blocksize=15, threshold_constant=-2,
iterations=0, **kwargs): iterations=0, resolution=300, **kwargs):
self.table_areas = table_areas self.table_areas = table_areas
self.process_background = process_background self.process_background = process_background
self.line_size_scaling = line_size_scaling self.line_size_scaling = line_size_scaling
@ -84,11 +89,13 @@ class Lattice(BaseParser):
self.shift_text = shift_text self.shift_text = shift_text
self.split_text = split_text self.split_text = split_text
self.flag_size = flag_size self.flag_size = flag_size
self.line_close_tol = line_close_tol self.strip_text = strip_text
self.joint_close_tol = joint_close_tol self.line_tol = line_tol
self.joint_tol = joint_tol
self.threshold_blocksize = threshold_blocksize self.threshold_blocksize = threshold_blocksize
self.threshold_constant = threshold_constant self.threshold_constant = threshold_constant
self.iterations = iterations self.iterations = iterations
self.resolution = resolution
@staticmethod @staticmethod
def _reduce_index(t, idx, shift_text): def _reduce_index(t, idx, shift_text):
@ -209,7 +216,7 @@ class Lattice(BaseParser):
'-sDEVICE=png16m', '-sDEVICE=png16m',
'-o', '-o',
self.imagename, self.imagename,
'-r600', '-r{}'.format(self.resolution),
self.filename self.filename
] ]
gs = get_executable() gs = get_executable()
@ -278,9 +285,9 @@ class Lattice(BaseParser):
rows.extend([tk[1], tk[3]]) rows.extend([tk[1], tk[3]])
# sort horizontal and vertical segments # sort horizontal and vertical segments
cols = merge_close_lines( cols = merge_close_lines(
sorted(cols), line_close_tol=self.line_close_tol) sorted(cols), line_tol=self.line_tol)
rows = merge_close_lines( rows = merge_close_lines(
sorted(rows, reverse=True), line_close_tol=self.line_close_tol) sorted(rows, reverse=True), line_tol=self.line_tol)
# make grid using x and y coord of shortlisted rows and cols # make grid using x and y coord of shortlisted rows and cols
cols = [(cols[i], cols[i + 1]) cols = [(cols[i], cols[i + 1])
for i in range(0, len(cols) - 1)] for i in range(0, len(cols) - 1)]
@ -297,7 +304,7 @@ class Lattice(BaseParser):
table = Table(cols, rows) table = Table(cols, rows)
# set table edges to True using ver+hor lines # set table edges to True using ver+hor lines
table = table.set_edges(v_s, h_s, joint_close_tol=self.joint_close_tol) table = table.set_edges(v_s, h_s, joint_tol=self.joint_tol)
# set table border edges to True # set table border edges to True
table = table.set_border() table = table.set_border()
# set spanning cells to True # set spanning cells to True
@ -310,7 +317,7 @@ class Lattice(BaseParser):
for t in self.t_bbox[direction]: for t in self.t_bbox[direction]:
indices, error = get_table_index( indices, error = get_table_index(
table, t, direction, split_text=self.split_text, table, t, direction, split_text=self.split_text,
flag_size=self.flag_size) flag_size=self.flag_size, strip_text=self.strip_text)
if indices[:2] != (-1, -1): if indices[:2] != (-1, -1):
pos_errors.append(error) pos_errors.append(error)
indices = Lattice._reduce_index(table, indices, shift_text=self.shift_text) indices = Lattice._reduce_index(table, indices, shift_text=self.shift_text)

View File

@ -38,23 +38,31 @@ class Stream(BaseParser):
flag_size : bool, optional (default: False) flag_size : bool, optional (default: False)
Flag text based on font size. Useful to detect Flag text based on font size. Useful to detect
super/subscripts. Adds <s></s> around flagged text. super/subscripts. Adds <s></s> around flagged text.
row_close_tol : int, optional (default: 2) strip_text : str, optional (default: '')
Characters that should be stripped from a string before
assigning it to a cell.
edge_tol : int, optional (default: 50)
Tolerance parameter for extending textedges vertically.
row_tol : int, optional (default: 2)
Tolerance parameter used to combine text vertically, Tolerance parameter used to combine text vertically,
to generate rows. to generate rows.
col_close_tol : int, optional (default: 0) column_tol : int, optional (default: 0)
Tolerance parameter used to combine text horizontally, Tolerance parameter used to combine text horizontally,
to generate columns. to generate columns.
""" """
def __init__(self, table_areas=None, columns=None, split_text=False, def __init__(self, table_areas=None, columns=None, split_text=False,
flag_size=False, row_close_tol=2, col_close_tol=0, **kwargs): flag_size=False, strip_text='', edge_tol=50, row_tol=2,
column_tol=0, **kwargs):
self.table_areas = table_areas self.table_areas = table_areas
self.columns = columns self.columns = columns
self._validate_columns() self._validate_columns()
self.split_text = split_text self.split_text = split_text
self.flag_size = flag_size self.flag_size = flag_size
self.row_close_tol = row_close_tol self.strip_text = strip_text
self.col_close_tol = col_close_tol self.edge_tol = edge_tol
self.row_tol = row_tol
self.column_tol = column_tol
@staticmethod @staticmethod
def _text_bbox(t_bbox): def _text_bbox(t_bbox):
@ -80,7 +88,7 @@ class Stream(BaseParser):
return text_bbox return text_bbox
@staticmethod @staticmethod
def _group_rows(text, row_close_tol=2): def _group_rows(text, row_tol=2):
"""Groups PDFMiner text objects into rows vertically """Groups PDFMiner text objects into rows vertically
within a tolerance. within a tolerance.
@ -88,7 +96,7 @@ class Stream(BaseParser):
---------- ----------
text : list text : list
List of PDFMiner text objects. List of PDFMiner text objects.
row_close_tol : int, optional (default: 2) row_tol : int, optional (default: 2)
Returns Returns
------- -------
@ -104,7 +112,7 @@ class Stream(BaseParser):
# if t.get_text().strip() and all([obj.upright for obj in t._objs if # if t.get_text().strip() and all([obj.upright for obj in t._objs if
# type(obj) is LTChar]): # type(obj) is LTChar]):
if t.get_text().strip(): if t.get_text().strip():
if not np.isclose(row_y, t.y0, atol=row_close_tol): if not np.isclose(row_y, t.y0, atol=row_tol):
rows.append(sorted(temp, key=lambda t: t.x0)) rows.append(sorted(temp, key=lambda t: t.x0))
temp = [] temp = []
row_y = t.y0 row_y = t.y0
@ -114,7 +122,7 @@ class Stream(BaseParser):
return rows return rows
@staticmethod @staticmethod
def _merge_columns(l, col_close_tol=0): def _merge_columns(l, column_tol=0):
"""Merges column boundaries horizontally if they overlap """Merges column boundaries horizontally if they overlap
or lie within a tolerance. or lie within a tolerance.
@ -122,7 +130,7 @@ class Stream(BaseParser):
---------- ----------
l : list l : list
List of column x-coordinate tuples. List of column x-coordinate tuples.
col_close_tol : int, optional (default: 0) column_tol : int, optional (default: 0)
Returns Returns
------- -------
@ -136,17 +144,17 @@ class Stream(BaseParser):
merged.append(higher) merged.append(higher)
else: else:
lower = merged[-1] lower = merged[-1]
if col_close_tol >= 0: if column_tol >= 0:
if (higher[0] <= lower[1] or if (higher[0] <= lower[1] or
np.isclose(higher[0], lower[1], atol=col_close_tol)): np.isclose(higher[0], lower[1], atol=column_tol)):
upper_bound = max(lower[1], higher[1]) upper_bound = max(lower[1], higher[1])
lower_bound = min(lower[0], higher[0]) lower_bound = min(lower[0], higher[0])
merged[-1] = (lower_bound, upper_bound) merged[-1] = (lower_bound, upper_bound)
else: else:
merged.append(higher) merged.append(higher)
elif col_close_tol < 0: elif column_tol < 0:
if higher[0] <= lower[1]: if higher[0] <= lower[1]:
if np.isclose(higher[0], lower[1], atol=abs(col_close_tol)): if np.isclose(higher[0], lower[1], atol=abs(column_tol)):
merged.append(higher) merged.append(higher)
else: else:
upper_bound = max(lower[1], higher[1]) upper_bound = max(lower[1], higher[1])
@ -183,7 +191,7 @@ class Stream(BaseParser):
return rows return rows
@staticmethod @staticmethod
def _add_columns(cols, text, row_close_tol): def _add_columns(cols, text, row_tol):
"""Adds columns to existing list by taking into account """Adds columns to existing list by taking into account
the text that lies outside the current column x-coordinates. the text that lies outside the current column x-coordinates.
@ -202,7 +210,7 @@ class Stream(BaseParser):
""" """
if text: if text:
text = Stream._group_rows(text, row_close_tol=row_close_tol) text = Stream._group_rows(text, row_tol=row_tol)
elements = [len(r) for r in text] elements = [len(r) for r in text]
new_cols = [(t.x0, t.x1) new_cols = [(t.x0, t.x1)
for r in text if len(r) == max(elements) for t in r] for r in text if len(r) == max(elements) for t in r]
@ -248,11 +256,10 @@ class Stream(BaseParser):
Assumes that tables are situated relatively far apart Assumes that tables are situated relatively far apart
vertically. vertically.
""" """
# TODO: add support for arabic text #141 # TODO: add support for arabic text #141
# sort textlines in reading order # sort textlines in reading order
textlines.sort(key=lambda x: (-x.y0, x.x0)) textlines.sort(key=lambda x: (-x.y0, x.x0))
textedges = TextEdges() textedges = TextEdges(edge_tol=self.edge_tol)
# generate left, middle and right textedges # generate left, middle and right textedges
textedges.generate(textlines) textedges.generate(textlines)
# select relevant edges # select relevant edges
@ -294,7 +301,7 @@ class Stream(BaseParser):
self.t_bbox = t_bbox self.t_bbox = t_bbox
text_x_min, text_y_min, text_x_max, text_y_max = self._text_bbox(self.t_bbox) text_x_min, text_y_min, text_x_max, text_y_max = self._text_bbox(self.t_bbox)
rows_grouped = self._group_rows(self.t_bbox['horizontal'], row_close_tol=self.row_close_tol) rows_grouped = self._group_rows(self.t_bbox['horizontal'], row_tol=self.row_tol)
rows = self._join_rows(rows_grouped, text_y_max, text_y_min) rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
elements = [len(r) for r in rows_grouped] elements = [len(r) for r in rows_grouped]
@ -325,7 +332,7 @@ class Stream(BaseParser):
warnings.warn("No tables found in table area {}".format( warnings.warn("No tables found in table area {}".format(
table_idx + 1)) table_idx + 1))
cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r] cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r]
cols = self._merge_columns(sorted(cols), col_close_tol=self.col_close_tol) cols = self._merge_columns(sorted(cols), column_tol=self.column_tol)
inner_text = [] inner_text = []
for i in range(1, len(cols)): for i in range(1, len(cols)):
left = cols[i - 1][1] left = cols[i - 1][1]
@ -337,7 +344,7 @@ class Stream(BaseParser):
for t in self.t_bbox[direction] for t in self.t_bbox[direction]
if t.x0 > cols[-1][1] or t.x1 < cols[0][0]] if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
inner_text.extend(outer_text) inner_text.extend(outer_text)
cols = self._add_columns(cols, inner_text, self.row_close_tol) cols = self._add_columns(cols, inner_text, self.row_tol)
cols = self._join_columns(cols, text_x_min, text_x_max) cols = self._join_columns(cols, text_x_min, text_x_max)
return cols, rows return cols, rows
@ -353,7 +360,7 @@ class Stream(BaseParser):
for t in self.t_bbox[direction]: for t in self.t_bbox[direction]:
indices, error = get_table_index( indices, error = get_table_index(
table, t, direction, split_text=self.split_text, table, t, direction, split_text=self.split_text,
flag_size=self.flag_size) flag_size=self.flag_size, strip_text=self.strip_text)
if indices[:2] != (-1, -1): if indices[:2] != (-1, -1):
pos_errors.append(error) pos_errors.append(error)
for r_idx, c_idx, text in indices: for r_idx, c_idx, text in indices:

View File

@ -20,16 +20,16 @@ from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
stream_kwargs = [ stream_kwargs = [
'columns', 'columns',
'row_close_tol', 'row_tol',
'col_close_tol' 'column_tol'
] ]
lattice_kwargs = [ lattice_kwargs = [
'process_background', 'process_background',
'line_size_scaling', 'line_size_scaling',
'copy_text', 'copy_text',
'shift_text', 'shift_text',
'line_close_tol', 'line_tol',
'joint_close_tol', 'joint_tol',
'threshold_blocksize', 'threshold_blocksize',
'threshold_constant', 'threshold_constant',
'iterations' 'iterations'
@ -281,14 +281,14 @@ def text_in_bbox(bbox, text):
return t_bbox return t_bbox
def merge_close_lines(ar, line_close_tol=2): def merge_close_lines(ar, line_tol=2):
"""Merges lines which are within a tolerance by calculating a """Merges lines which are within a tolerance by calculating a
moving mean, based on their x or y axis projections. moving mean, based on their x or y axis projections.
Parameters Parameters
---------- ----------
ar : list ar : list
line_close_tol : int, optional (default: 2) line_tol : int, optional (default: 2)
Returns Returns
------- -------
@ -301,7 +301,7 @@ def merge_close_lines(ar, line_close_tol=2):
ret.append(a) ret.append(a)
else: else:
temp = ret[-1] temp = ret[-1]
if np.isclose(temp, a, atol=line_close_tol): if np.isclose(temp, a, atol=line_tol):
temp = (temp + a) / 2.0 temp = (temp + a) / 2.0
ret[-1] = temp ret[-1] = temp
else: else:
@ -309,7 +309,12 @@ def merge_close_lines(ar, line_close_tol=2):
return ret return ret
def flag_font_size(textline, direction): # TODO: combine the following functions into a TextProcessor class which
# applies corresponding transformations sequentially
# (inspired from sklearn.pipeline.Pipeline)
def flag_font_size(textline, direction, strip_text=''):
"""Flags super/subscripts in text by enclosing them with <s></s>. """Flags super/subscripts in text by enclosing them with <s></s>.
May give false positives. May give false positives.
@ -319,6 +324,9 @@ def flag_font_size(textline, direction):
List of PDFMiner LTChar objects. List of PDFMiner LTChar objects.
direction : string direction : string
Direction of the PDFMiner LTTextLine object. Direction of the PDFMiner LTTextLine object.
strip_text : str, optional (default: '')
Characters that should be stripped from a string before
assigning it to a cell.
Returns Returns
------- -------
@ -344,13 +352,13 @@ def flag_font_size(textline, direction):
fchars = [t[0] for t in chars] fchars = [t[0] for t in chars]
if ''.join(fchars).strip(): if ''.join(fchars).strip():
flist.append(''.join(fchars)) flist.append(''.join(fchars))
fstring = ''.join(flist) fstring = ''.join(flist).strip(strip_text)
else: else:
fstring = ''.join([t.get_text() for t in textline]) fstring = ''.join([t.get_text() for t in textline]).strip(strip_text)
return fstring return fstring
def split_textline(table, textline, direction, flag_size=False): def split_textline(table, textline, direction, flag_size=False, strip_text=''):
"""Splits PDFMiner LTTextLine into substrings if it spans across """Splits PDFMiner LTTextLine into substrings if it spans across
multiple rows/columns. multiple rows/columns.
@ -365,6 +373,9 @@ def split_textline(table, textline, direction, flag_size=False):
Whether or not to highlight a substring using <s></s> Whether or not to highlight a substring using <s></s>
if its size is different from rest of the string. (Useful for if its size is different from rest of the string. (Useful for
super and subscripts.) super and subscripts.)
strip_text : str, optional (default: '')
Characters that should be stripped from a string before
assigning it to a cell.
Returns Returns
------- -------
@ -416,14 +427,15 @@ def split_textline(table, textline, direction, flag_size=False):
grouped_chars = [] grouped_chars = []
for key, chars in groupby(cut_text, itemgetter(0, 1)): for key, chars in groupby(cut_text, itemgetter(0, 1)):
if flag_size: if flag_size:
grouped_chars.append((key[0], key[1], flag_font_size([t[2] for t in chars], direction))) grouped_chars.append((key[0], key[1],
flag_font_size([t[2] for t in chars], direction, strip_text=strip_text)))
else: else:
gchars = [t[2].get_text() for t in chars] gchars = [t[2].get_text() for t in chars]
grouped_chars.append((key[0], key[1], ''.join(gchars))) grouped_chars.append((key[0], key[1], ''.join(gchars).strip(strip_text)))
return grouped_chars return grouped_chars
def get_table_index(table, t, direction, split_text=False, flag_size=False): def get_table_index(table, t, direction, split_text=False, flag_size=False, strip_text='',):
"""Gets indices of the table cell where given text object lies by """Gets indices of the table cell where given text object lies by
comparing their y and x-coordinates. comparing their y and x-coordinates.
@ -441,6 +453,9 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False):
Whether or not to highlight a substring using <s></s> Whether or not to highlight a substring using <s></s>
if its size is different from rest of the string. (Useful for if its size is different from rest of the string. (Useful for
super and subscripts) super and subscripts)
strip_text : str, optional (default: '')
Characters that should be stripped from a string before
assigning it to a cell.
Returns Returns
------- -------
@ -495,12 +510,12 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False):
error = ((X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))) / charea error = ((X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))) / charea
if split_text: if split_text:
return split_textline(table, t, direction, flag_size=flag_size), error return split_textline(table, t, direction, flag_size=flag_size, strip_text=strip_text), error
else: else:
if flag_size: if flag_size:
return [(r_idx, c_idx, flag_font_size(t._objs, direction))], error return [(r_idx, c_idx, flag_font_size(t._objs, direction, strip_text=strip_text))], error
else: else:
return [(r_idx, c_idx, t.get_text())], error return [(r_idx, c_idx, t.get_text().strip(strip_text))], error
def compute_accuracy(error_weights): def compute_accuracy(error_weights):

BIN
docs/_static/png/edge_tol_1.png vendored 100644

Binary file not shown.

After

Width:  |  Height:  |  Size: 20 KiB

BIN
docs/_static/png/edge_tol_2.png vendored 100644

Binary file not shown.

After

Width:  |  Height:  |  Size: 20 KiB

View File

@ -316,10 +316,87 @@ You can solve this by passing ``flag_size=True``, which will enclose the supersc
"Madhya Pradesh","27.13","23.57","-","-","3.56","0.38","-","1.86","-","1.28" "Madhya Pradesh","27.13","23.57","-","-","3.56","0.38","-","1.86","-","1.28"
"...","...","...","...","...","...","...","...","...","...","..." "...","...","...","...","...","...","...","...","...","...","..."
Control how text is grouped into rows Strip characters from text
------------------------------------- --------------------------
You can pass ``row_close_tol=<+int>`` to group the rows closer together, as shown below. You can strip unwanted characters like spaces, dots and newlines from a string using the ``strip_text`` keyword argument. Take a look at `this PDF <https://github.com/socialcopsdev/camelot/blob/master/tests/files/tabula/12s0324.pdf>`_ as an example; the text at the start of each row contains a lot of unwanted spaces, dots and newlines.
::
>>> tables = camelot.read_pdf('12s0324.pdf', flavor='stream', strip_text=' .\n')
>>> tables[0].df
.. tip::
Here's how you can do the same with the :ref:`command-line interface <cli>`.
::
$ camelot -strip ' .\n' stream 12s0324.pdf
.. csv-table::
"...","...","...","...","...","...","...","...","...","..."
"Forcible rape","17.5","2.6","14.9","17.2","2.5","14.7","","",""
"Robbery","102.1","25.5","76.6","90.0","22.9","67.1","12.1","2.5","9.5"
"Aggravated assault","338.4","40.1","298.3","264.0","30.2","233.8","74.4","9.9","64.5"
"Property crime","1,396 .4","338 .7","1,057 .7","875 .9","210 .8","665 .1","608 .2","127 .9","392 .6"
"Burglary","240.9","60.3","180.6","205.0","53.4","151.7","35.9","6.9","29.0"
"...","...","...","...","...","...","...","...","...","..."
Improve guessed table areas
---------------------------
While using :ref:`Stream <stream>`, automatic table detection can fail for PDFs like `this one <https://github.com/socialcopsdev/camelot/blob/master/tests/files/edge_tol.pdf>`_. That's because the text is relatively far apart vertically, which can lead to shorter textedges being calculated.
.. note:: To know more about how textedges are calculated to guess table areas, you can see pages 20, 35 and 40 of `Anssi Nurminen's master's thesis <http://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3>`_.
Let's see the table area that is detected by default.
::
>>> tables = camelot.read_pdf('edge_tol.pdf', flavor='stream')
>>> camelot.plot(tables[0], kind='contour')
>>> plt.show()
.. tip::
Here's how you can do the same with the :ref:`command-line interface <cli>`.
::
$ camelot stream -plot contour edge_tol.pdf
.. figure:: ../_static/png/edge_tol_1.png
:height: 674
:width: 1366
:scale: 50%
:alt: Table area with default edge_tol
:align: left
To improve the detected area, you can increase the ``edge_tol`` (default: 50) value to counter the effect of text being placed relatively far apart vertically. Larger ``edge_tol`` will lead to longer textedges being detected, leading to an improved guess of the table area. Let's use a value of 500.
::
>>> tables = camelot.read_pdf('edge_tol.pdf', flavor='stream', edge_tol=500)
>>> camelot.plot(tables[0], kind='contour')
>>> plt.show()
.. tip::
Here's how you can do the same with the :ref:`command-line interface <cli>`.
::
$ camelot stream -e 500 -plot contour edge_tol.pdf
.. figure:: ../_static/png/edge_tol_2.png
:height: 674
:width: 1366
:scale: 50%
:alt: Table area with edge_tol=500
:align: left
As you can see, the guessed table area has improved!
Improve guessed table rows
--------------------------
You can pass ``row_tol=<+int>`` to group the rows closer together, as shown below.
:: ::
@ -337,7 +414,7 @@ You can pass ``row_close_tol=<+int>`` to group the rows closer together, as show
:: ::
>>> tables = camelot.read_pdf('group_rows.pdf', flavor='stream', row_close_tol=10) >>> tables = camelot.read_pdf('group_rows.pdf', flavor='stream', row_tol=10)
>>> tables[0].df >>> tables[0].df
.. tip:: .. tip::

View File

@ -312,6 +312,63 @@ data_stream_flag_size = [
["ALL STATES", "513.38", "436.02", "-", "25.57", "51.06", "14.18", "-", "8.21", "11.83", "11.08"] ["ALL STATES", "513.38", "436.02", "-", "25.57", "51.06", "14.18", "-", "8.21", "11.83", "11.08"]
] ]
# Expected table rows for the stream-flavor strip_text test: this is the
# DataFrame content compared against the first table parsed from
# detect_vertical_false.pdf with strip_text="\n" (see test_stream_strip_text).
# Each inner list is one table row: [description cell, price/volume cell].
data_stream_strip_text = [
["V i n s a u Ve r r e", ""],
["Les Blancs", "12.5CL"],
["A.O.P Côtes du Rhône", ""],
["Domaine de la Guicharde « Autour de la chapelle » 2016", "8 €"],
["A.O.P Vacqueyras", ""],
["Domaine de Montvac « Melodine » 2016", "10 €"],
["A.O.P Châteauneuf du Pape", ""],
["Domaine de Beaurenard 2017", "13 €"],
["A.O.P Côteaux du Languedoc", ""],
["Villa Tempora « Un temps pour elle » 2014", "9 €"],
["A.O.P Côtes de Provence", ""],
["Château Grand Boise 2017", "9 €"],
["Les Rosés", "12,5 CL"],
["A.O.P Côtes du Rhône", ""],
["Domaine de la Florane « A fleur de Pampre » 2016", "8 €"],
["Famille Coulon (Domaine Beaurenard) Biotifulfox 2017", "8 €"],
["A.O.P Vacqueyras", ""],
["Domaine de Montvac 2017", "9 €"],
["A.O.P Languedoc", ""],
["Domaine de Joncas « Nébla » 2015", "8 €"],
["Villa Tempora « Larroseur arrosé » 2015", "9 €"],
["A.O.P Côtes de Provence", ""],
["Château Grand Boise « Sainte Victoire » 2017", "9 €"],
["Château Léoube 2016", "10 €"]
]
# Expected table rows for the stream-flavor edge_tol test: this is the
# DataFrame content compared against the first table parsed from edge_tol.pdf
# with edge_tol=500 (see test_stream_edge_tol).
# Several first-column cells carry an embedded "\n" right after their first
# letter; that is part of the expected output as recorded here, not a typo.
data_stream_edge_tol = [
["Key figures", ""],
["", "2016"],
["(all amounts in EUR)", ""],
["C\nlass A", ""],
["N\net Asset Value at 31 December", "5,111,372"],
["N\number of outstanding units at 31 December", "49,136"],
["N\net Asset Value per unit at 31 December", "104.03"],
["C\nlass B", ""],
["N\net Asset Value at 31 December", "49,144,825"],
["N\number of outstanding units at 31 December", "471,555"],
["N\net Asset Value per unit at 31 December", "104.22"],
["T\notal for the Fund", ""],
["N\net Asset Value at 31 December", "54,256,197"],
["N\number of outstanding units at 31 December", "520,691"],
["I\nnvestment result", ""],
["Direct result", "-"],
["Revaluation", "2,076,667"],
["Costs", "(106,870)"],
["T\notal investment result for the period1", "1,969,797"],
["I\nnvestment result per unit2", ""],
["Direct result", "-"],
["Revaluation", "3.99"],
["Costs", "(0.21)"],
["T\notal investment result per unit", "3.78"],
["1 The results cover the period from inception of the Fund at 8 April 2016 through 31 December 2016.", ""],
["2 The result per unit is calculated using the total number of outstanding unit as per the end of the", ""],
["period.", ""]
]
data_lattice = [ data_lattice = [
["Cycle \nName", "KI \n(1/km)", "Distance \n(mi)", "Percent Fuel Savings", "", "", ""], ["Cycle \nName", "KI \n(1/km)", "Distance \n(mi)", "Percent Fuel Savings", "", "", ""],
["", "", "", "Improved \nSpeed", "Decreased \nAccel", "Eliminate \nStops", "Decreased \nIdle"], ["", "", "", "Improved \nSpeed", "Decreased \nAccel", "Eliminate \nStops", "Decreased \nIdle"],
@ -485,49 +542,49 @@ data_lattice_shift_text_right_bottom = [
] ]
data_arabic = [ data_arabic = [
['ً\n\xa0\nﺎﺒﺣﺮﻣ', 'ﻥﺎﻄﻠﺳ\xa0ﻲﻤﺳﺍ'], ["ً\n\xa0\nﺎﺒﺣﺮﻣ", "ﻥﺎﻄﻠﺳ\xa0ﻲﻤﺳﺍ"],
['ﻝﺎﻤﺸﻟﺍ\xa0ﺎﻨﻴﻟﻭﺭﺎﻛ\xa0ﺔﻳﻻﻭ\xa0ﻦﻣ\xa0ﺎﻧﺍ', '؟ﺖﻧﺍ\xa0ﻦﻳﺍ\xa0ﻦﻣ'], ["ﻝﺎﻤﺸﻟﺍ\xa0ﺎﻨﻴﻟﻭﺭﺎﻛ\xa0ﺔﻳﻻﻭ\xa0ﻦﻣ\xa0ﺎﻧﺍ", "؟ﺖﻧﺍ\xa0ﻦﻳﺍ\xa0ﻦﻣ"],
['1234', 'ﻂﻄﻗ\xa047\xa0ﻱﺪﻨﻋ'], ["1234", "ﻂﻄﻗ\xa047\xa0ﻱﺪﻨﻋ"],
['؟ﻙﺎﺒﺷ\xa0ﺖﻧﺍ\xa0ﻞﻫ', 'ﺔﻳﺰﻴﻠﺠﻧﻻﺍ\xa0ﻲﻓ\xa0Jeremy\xa0ﻲﻤﺳﺍ'], ["؟ﻙﺎﺒﺷ\xa0ﺖﻧﺍ\xa0ﻞﻫ", "ﺔﻳﺰﻴﻠﺠﻧﻻﺍ\xa0ﻲﻓ\xa0Jeremy\xa0ﻲﻤﺳﺍ"],
['Jeremy\xa0is\xa0ﻲﻣﺮﺟ\xa0in\xa0Arabic', ''] ["Jeremy\xa0is\xa0ﻲﻣﺮﺟ\xa0in\xa0Arabic", ""]
] ]
data_stream_layout_kwargs = [ data_stream_layout_kwargs = [
['V i n s a u Ve r r e', ''], ["V i n s a u Ve r r e", ""],
['Les Blancs', '12.5CL'], ["Les Blancs", "12.5CL"],
['A.O.P Côtes du Rhône', ''], ["A.O.P Côtes du Rhône", ""],
['Domaine de la Guicharde « Autour de la chapelle » 2016', '8 €'], ["Domaine de la Guicharde « Autour de la chapelle » 2016", "8 €"],
['A.O.P Vacqueyras', ''], ["A.O.P Vacqueyras", ""],
['Domaine de Montvac « Melodine » 2016', '10 €'], ["Domaine de Montvac « Melodine » 2016", "10 €"],
['A.O.P Châteauneuf du Pape', ''], ["A.O.P Châteauneuf du Pape", ""],
['Domaine de Beaurenard 2017', '13 €'], ["Domaine de Beaurenard 2017", "13 €"],
['A.O.P Côteaux du Languedoc', ''], ["A.O.P Côteaux du Languedoc", ""],
['Villa Tempora « Un temps pour elle » 2014', '9 €'], ["Villa Tempora « Un temps pour elle » 2014", "9 €"],
['A.O.P Côtes de Provence', ''], ["A.O.P Côtes de Provence", ""],
['Château Grand Boise 2017', '9 €'], ["Château Grand Boise 2017", "9 €"],
['Les Rosés', '12,5 CL'], ["Les Rosés", "12,5 CL"],
['A.O.P Côtes du Rhône', ''], ["A.O.P Côtes du Rhône", ""],
['Domaine de la Florane « A fleur de Pampre » 2016', '8 €'], ["Domaine de la Florane « A fleur de Pampre » 2016", "8 €"],
['Famille Coulon (Domaine Beaurenard) Biotifulfox 2017', '8 €'], ["Famille Coulon (Domaine Beaurenard) Biotifulfox 2017", "8 €"],
['A.O.P Vacqueyras', ''], ["A.O.P Vacqueyras", ""],
['Domaine de Montvac 2017', '9 €'], ["Domaine de Montvac 2017", "9 €"],
['A.O.P Languedoc', ''], ["A.O.P Languedoc", ""],
['Domaine de Joncas « Nébla » 2015', '8 €'], ["Domaine de Joncas « Nébla » 2015", "8 €"],
['Villa Tempora « Larroseur arrosé » 2015', '9 €'], ["Villa Tempora « Larroseur arrosé » 2015", "9 €"],
['A.O.P Côtes de Provence', ''], ["A.O.P Côtes de Provence", ""],
['Château Grand Boise « Sainte Victoire » 2017', '9 €'], ["Château Grand Boise « Sainte Victoire » 2017", "9 €"],
['Château Léoube 2016', '10 €'], ["Château Léoube 2016", "10 €"],
['Les Rouges', '12,CL'], ["Les Rouges", "12,CL"],
['A.O.P Côtes du Rhône', ''], ["A.O.P Côtes du Rhône", ""],
['Domaine de Dionysos « La Cigalette »', '8 €'], ["Domaine de Dionysos « La Cigalette »", "8 €"],
['Château Saint Estève dUchaux « Grande Réserve » 2014', '9 €'], ["Château Saint Estève dUchaux « Grande Réserve » 2014", "9 €"],
['Domaine de la Guicharde « Cuvée Massillan » 2016', '9 €'], ["Domaine de la Guicharde « Cuvée Massillan » 2016", "9 €"],
['Domaine de la Florane « Terre Pourpre » 2014', '10 €'], ["Domaine de la Florane « Terre Pourpre » 2014", "10 €"],
['LOratoire St Martin « Réserve des Seigneurs » 2015', '11 €'], ["LOratoire St Martin « Réserve des Seigneurs » 2015", "11 €"],
['A.O.P Saint Joseph', ''], ["A.O.P Saint Joseph", ""],
['Domaine Monier Perréol « Châtelet » 2015', '13 €'], ["Domaine Monier Perréol « Châtelet » 2015", "13 €"],
['A.O.P Châteauneuf du Pape', ''], ["A.O.P Châteauneuf du Pape", ""],
['Domaine de Beaurenard 2011', '15 €'], ["Domaine de Beaurenard 2011", "15 €"],
['A.O.P Cornas', ''], ["A.O.P Cornas", ""],
['Domaine Lionnet « Terre Brûlée » 2012', '15 €'] ["Domaine Lionnet « Terre Brûlée » 2012", "15 €"]
] ]

Binary file not shown.

Before

Width:  |  Height:  |  Size: 8.2 KiB

After

Width:  |  Height:  |  Size: 8.2 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 35 KiB

After

Width:  |  Height:  |  Size: 48 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 33 KiB

After

Width:  |  Height:  |  Size: 46 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 6.6 KiB

After

Width:  |  Height:  |  Size: 6.7 KiB

Binary file not shown.

View File

@ -81,7 +81,7 @@ def test_stream_columns():
filename = os.path.join(testdir, "mexican_towns.pdf") filename = os.path.join(testdir, "mexican_towns.pdf")
tables = camelot.read_pdf( tables = camelot.read_pdf(
filename, flavor="stream", columns=["67,180,230,425,475"], row_close_tol=10) filename, flavor="stream", columns=["67,180,230,425,475"], row_tol=10)
assert df.equals(tables[0].df) assert df.equals(tables[0].df)
@ -102,6 +102,22 @@ def test_stream_flag_size():
assert df.equals(tables[0].df) assert df.equals(tables[0].df)
def test_stream_strip_text():
    """Check stream parsing when newline characters are stripped from cells.

    Parses ``detect_vertical_false.pdf`` with the stream flavor and
    ``strip_text`` set to a newline, then compares the first detected
    table against ``data_stream_strip_text``.
    """
    pdf_path = os.path.join(testdir, "detect_vertical_false.pdf")
    expected = pd.DataFrame(data_stream_strip_text)
    parsed = camelot.read_pdf(pdf_path, flavor="stream", strip_text="\n")
    first_table = parsed[0]
    assert expected.equals(first_table.df)
def test_stream_edge_tol():
    """Check stream parsing with a widened ``edge_tol``.

    Parses ``edge_tol.pdf`` with the stream flavor and ``edge_tol=500``,
    then compares the first detected table against
    ``data_stream_edge_tol``.
    """
    pdf_path = os.path.join(testdir, "edge_tol.pdf")
    expected = pd.DataFrame(data_stream_edge_tol)
    parsed = camelot.read_pdf(pdf_path, flavor="stream", edge_tol=500)
    first_table = parsed[0]
    assert expected.equals(first_table.df)
def test_stream_layout_kwargs(): def test_stream_layout_kwargs():
df = pd.DataFrame(data_stream_layout_kwargs) df = pd.DataFrame(data_stream_layout_kwargs)
@ -188,7 +204,7 @@ def test_repr():
tables = camelot.read_pdf(filename) tables = camelot.read_pdf(filename)
assert repr(tables) == "<TableList n=1>" assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>" assert repr(tables[0]) == "<Table shape=(7, 7)>"
assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.42 x2=164.64 y2=233.89>" assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
def test_arabic(): def test_arabic():