Rename kwargs and add tests

pull/2/head
Vinayak Mehta 2018-12-21 15:09:37 +05:30
parent f6aa21c31f
commit 50b4468aff
10 changed files with 193 additions and 113 deletions

View File

@ -70,10 +70,10 @@ def cli(ctx, *args, **kwargs):
@click.option('-shift', '--shift_text', default=['l', 't'], @click.option('-shift', '--shift_text', default=['l', 't'],
type=click.Choice(['', 'l', 'r', 't', 'b']), multiple=True, type=click.Choice(['', 'l', 'r', 't', 'b']), multiple=True,
help='Direction in which text in a spanning cell will flow.') help='Direction in which text in a spanning cell will flow.')
@click.option('-l', '--line_close_tol', default=2, @click.option('-l', '--line_tol', default=2,
help='Tolerance parameter used to merge close vertical' help='Tolerance parameter used to merge close vertical'
' and horizontal lines.') ' and horizontal lines.')
@click.option('-j', '--joint_close_tol', default=2, @click.option('-j', '--joint_tol', default=2,
help='Tolerance parameter used to decide whether' help='Tolerance parameter used to decide whether'
' the detected lines and points lie close to each other.') ' the detected lines and points lie close to each other.')
@click.option('-block', '--threshold_blocksize', default=15, @click.option('-block', '--threshold_blocksize', default=15,
@ -137,11 +137,11 @@ def lattice(c, *args, **kwargs):
' where x1, y1 -> left-top and x2, y2 -> right-bottom.') ' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@click.option('-C', '--columns', default=[], multiple=True, @click.option('-C', '--columns', default=[], multiple=True,
help='X coordinates of column separators.') help='X coordinates of column separators.')
@click.option('-e', '--edge_close_tol', default=50, help='Tolerance parameter' @click.option('-e', '--edge_tol', default=50, help='Tolerance parameter'
' for extending textedges vertically.') ' for extending textedges vertically.')
@click.option('-r', '--row_close_tol', default=2, help='Tolerance parameter' @click.option('-r', '--row_tol', default=2, help='Tolerance parameter'
' used to combine text vertically, to generate rows.') ' used to combine text vertically, to generate rows.')
@click.option('-c', '--col_close_tol', default=0, help='Tolerance parameter' @click.option('-c', '--column_tol', default=0, help='Tolerance parameter'
' used to combine text horizontally, to generate columns.') ' used to combine text horizontally, to generate columns.')
@click.option('-plot', '--plot_type', @click.option('-plot', '--plot_type',
type=click.Choice(['text', 'grid', 'contour', 'textedge']), type=click.Choice(['text', 'grid', 'contour', 'textedge']),

View File

@ -53,11 +53,11 @@ class TextEdge(object):
return '<TextEdge x={} y0={} y1={} align={} valid={}>'.format( return '<TextEdge x={} y0={} y1={} align={} valid={}>'.format(
round(self.x, 2), round(self.y0, 2), round(self.y1, 2), self.align, self.is_valid) round(self.x, 2), round(self.y0, 2), round(self.y1, 2), self.align, self.is_valid)
def update_coords(self, x, y0, edge_close_tol=50): def update_coords(self, x, y0, edge_tol=50):
"""Updates the text edge's x and bottom y coordinates and sets """Updates the text edge's x and bottom y coordinates and sets
the is_valid attribute. the is_valid attribute.
""" """
if np.isclose(self.y0, y0, atol=edge_close_tol): if np.isclose(self.y0, y0, atol=edge_tol):
self.x = (self.intersections * self.x + x) / float(self.intersections + 1) self.x = (self.intersections * self.x + x) / float(self.intersections + 1)
self.y0 = y0 self.y0 = y0
self.intersections += 1 self.intersections += 1
@ -72,8 +72,8 @@ class TextEdges(object):
the PDF page. The dict has three keys based on the alignments, the PDF page. The dict has three keys based on the alignments,
and each key's value is a list of camelot.core.TextEdge objects. and each key's value is a list of camelot.core.TextEdge objects.
""" """
def __init__(self, edge_close_tol=50): def __init__(self, edge_tol=50):
self.edge_close_tol = edge_close_tol self.edge_tol = edge_tol
self._textedges = {'left': [], 'right': [], 'middle': []} self._textedges = {'left': [], 'right': [], 'middle': []}
@staticmethod @staticmethod
@ -115,7 +115,7 @@ class TextEdges(object):
self.add(textline, align) self.add(textline, align)
else: else:
self._textedges[align][idx].update_coords( self._textedges[align][idx].update_coords(
x_coord, textline.y0, edge_close_tol=self.edge_close_tol) x_coord, textline.y0, edge_tol=self.edge_tol)
def generate(self, textlines): def generate(self, textlines):
"""Generates the text edges dict based on horizontal text """Generates the text edges dict based on horizontal text
@ -359,7 +359,7 @@ class Table(object):
cell.left = cell.right = cell.top = cell.bottom = True cell.left = cell.right = cell.top = cell.bottom = True
return self return self
def set_edges(self, vertical, horizontal, joint_close_tol=2): def set_edges(self, vertical, horizontal, joint_tol=2):
"""Sets a cell's edges to True depending on whether the cell's """Sets a cell's edges to True depending on whether the cell's
coordinates overlap with the line's coordinates within a coordinates overlap with the line's coordinates within a
tolerance. tolerance.
@ -376,11 +376,11 @@ class Table(object):
# find closest x coord # find closest x coord
# iterate over y coords and find closest start and end points # iterate over y coords and find closest start and end points
i = [i for i, t in enumerate(self.cols) i = [i for i, t in enumerate(self.cols)
if np.isclose(v[0], t[0], atol=joint_close_tol)] if np.isclose(v[0], t[0], atol=joint_tol)]
j = [j for j, t in enumerate(self.rows) j = [j for j, t in enumerate(self.rows)
if np.isclose(v[3], t[0], atol=joint_close_tol)] if np.isclose(v[3], t[0], atol=joint_tol)]
k = [k for k, t in enumerate(self.rows) k = [k for k, t in enumerate(self.rows)
if np.isclose(v[1], t[0], atol=joint_close_tol)] if np.isclose(v[1], t[0], atol=joint_tol)]
if not j: if not j:
continue continue
J = j[0] J = j[0]
@ -427,11 +427,11 @@ class Table(object):
# find closest y coord # find closest y coord
# iterate over x coords and find closest start and end points # iterate over x coords and find closest start and end points
i = [i for i, t in enumerate(self.rows) i = [i for i, t in enumerate(self.rows)
if np.isclose(h[1], t[0], atol=joint_close_tol)] if np.isclose(h[1], t[0], atol=joint_tol)]
j = [j for j, t in enumerate(self.cols) j = [j for j, t in enumerate(self.cols)
if np.isclose(h[0], t[0], atol=joint_close_tol)] if np.isclose(h[0], t[0], atol=joint_tol)]
k = [k for k, t in enumerate(self.cols) k = [k for k, t in enumerate(self.cols)
if np.isclose(h[2], t[0], atol=joint_close_tol)] if np.isclose(h[2], t[0], atol=joint_tol)]
if not j: if not j:
continue continue
J = j[0] J = j[0]

View File

@ -40,10 +40,13 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
flag_size : bool, optional (default: False) flag_size : bool, optional (default: False)
Flag text based on font size. Useful to detect Flag text based on font size. Useful to detect
super/subscripts. Adds <s></s> around flagged text. super/subscripts. Adds <s></s> around flagged text.
row_close_tol^ : int, optional (default: 2) strip_text : str, optional (default: '')
Characters that should be stripped from a string before
assigning it to a cell.
row_tol^ : int, optional (default: 2)
Tolerance parameter used to combine text vertically, Tolerance parameter used to combine text vertically,
to generate rows. to generate rows.
col_close_tol^ : int, optional (default: 0) column_tol^ : int, optional (default: 0)
Tolerance parameter used to combine text horizontally, Tolerance parameter used to combine text horizontally,
to generate columns. to generate columns.
process_background* : bool, optional (default: False) process_background* : bool, optional (default: False)
@ -59,10 +62,10 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
shift_text* : list, optional (default: ['l', 't']) shift_text* : list, optional (default: ['l', 't'])
{'l', 'r', 't', 'b'} {'l', 'r', 't', 'b'}
Direction in which text in a spanning cell will flow. Direction in which text in a spanning cell will flow.
line_close_tol* : int, optional (default: 2) line_tol* : int, optional (default: 2)
Tolerance parameter used to merge close vertical and horizontal Tolerance parameter used to merge close vertical and horizontal
lines. lines.
joint_close_tol* : int, optional (default: 2) joint_tol* : int, optional (default: 2)
Tolerance parameter used to decide whether the detected lines Tolerance parameter used to decide whether the detected lines
and points lie close to each other. and points lie close to each other.
threshold_blocksize* : int, optional (default: 15) threshold_blocksize* : int, optional (default: 15)
@ -79,6 +82,8 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
Number of times for erosion/dilation is applied. Number of times for erosion/dilation is applied.
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_. For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
resolution* : int, optional (default: 300)
Resolution used for PDF to PNG conversion.
Returns Returns
------- -------

View File

@ -47,16 +47,16 @@ class Lattice(BaseParser):
Direction in which text in a spanning cell will flow. Direction in which text in a spanning cell will flow.
split_text : bool, optional (default: False) split_text : bool, optional (default: False)
Split text that spans across multiple cells. Split text that spans across multiple cells.
strip_text : str, optional (default: '')
Characters that should be stripped from a string before
assigning it to a cell.
flag_size : bool, optional (default: False) flag_size : bool, optional (default: False)
Flag text based on font size. Useful to detect Flag text based on font size. Useful to detect
super/subscripts. Adds <s></s> around flagged text. super/subscripts. Adds <s></s> around flagged text.
line_close_tol : int, optional (default: 2) strip_text : str, optional (default: '')
Characters that should be stripped from a string before
assigning it to a cell.
line_tol : int, optional (default: 2)
Tolerance parameter used to merge close vertical and horizontal Tolerance parameter used to merge close vertical and horizontal
lines. lines.
joint_close_tol : int, optional (default: 2) joint_tol : int, optional (default: 2)
Tolerance parameter used to decide whether the detected lines Tolerance parameter used to decide whether the detected lines
and points lie close to each other. and points lie close to each other.
threshold_blocksize : int, optional (default: 15) threshold_blocksize : int, optional (default: 15)
@ -73,12 +73,14 @@ class Lattice(BaseParser):
Number of times for erosion/dilation is applied. Number of times for erosion/dilation is applied.
For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_. For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
resolution : int, optional (default: 300)
Resolution used for PDF to PNG conversion.
""" """
def __init__(self, table_areas=None, process_background=False, def __init__(self, table_areas=None, process_background=False,
line_size_scaling=15, copy_text=None, shift_text=['l', 't'], line_size_scaling=15, copy_text=None, shift_text=['l', 't'],
split_text=False, flag_size=False, strip_text='', line_close_tol=2, split_text=False, flag_size=False, strip_text='', line_tol=2,
joint_close_tol=2, threshold_blocksize=15, threshold_constant=-2, joint_tol=2, threshold_blocksize=15, threshold_constant=-2,
iterations=0, resolution=300, **kwargs): iterations=0, resolution=300, **kwargs):
self.table_areas = table_areas self.table_areas = table_areas
self.process_background = process_background self.process_background = process_background
@ -88,8 +90,8 @@ class Lattice(BaseParser):
self.split_text = split_text self.split_text = split_text
self.flag_size = flag_size self.flag_size = flag_size
self.strip_text = strip_text self.strip_text = strip_text
self.line_close_tol = line_close_tol self.line_tol = line_tol
self.joint_close_tol = joint_close_tol self.joint_tol = joint_tol
self.threshold_blocksize = threshold_blocksize self.threshold_blocksize = threshold_blocksize
self.threshold_constant = threshold_constant self.threshold_constant = threshold_constant
self.iterations = iterations self.iterations = iterations
@ -283,9 +285,9 @@ class Lattice(BaseParser):
rows.extend([tk[1], tk[3]]) rows.extend([tk[1], tk[3]])
# sort horizontal and vertical segments # sort horizontal and vertical segments
cols = merge_close_lines( cols = merge_close_lines(
sorted(cols), line_close_tol=self.line_close_tol) sorted(cols), line_tol=self.line_tol)
rows = merge_close_lines( rows = merge_close_lines(
sorted(rows, reverse=True), line_close_tol=self.line_close_tol) sorted(rows, reverse=True), line_tol=self.line_tol)
# make grid using x and y coord of shortlisted rows and cols # make grid using x and y coord of shortlisted rows and cols
cols = [(cols[i], cols[i + 1]) cols = [(cols[i], cols[i + 1])
for i in range(0, len(cols) - 1)] for i in range(0, len(cols) - 1)]
@ -302,7 +304,7 @@ class Lattice(BaseParser):
table = Table(cols, rows) table = Table(cols, rows)
# set table edges to True using ver+hor lines # set table edges to True using ver+hor lines
table = table.set_edges(v_s, h_s, joint_close_tol=self.joint_close_tol) table = table.set_edges(v_s, h_s, joint_tol=self.joint_tol)
# set table border edges to True # set table border edges to True
table = table.set_border() table = table.set_border()
# set spanning cells to True # set spanning cells to True
@ -315,7 +317,7 @@ class Lattice(BaseParser):
for t in self.t_bbox[direction]: for t in self.t_bbox[direction]:
indices, error = get_table_index( indices, error = get_table_index(
table, t, direction, split_text=self.split_text, table, t, direction, split_text=self.split_text,
flag_size=self.flag_size) flag_size=self.flag_size, strip_text=self.strip_text)
if indices[:2] != (-1, -1): if indices[:2] != (-1, -1):
pos_errors.append(error) pos_errors.append(error)
indices = Lattice._reduce_index(table, indices, shift_text=self.shift_text) indices = Lattice._reduce_index(table, indices, shift_text=self.shift_text)

View File

@ -35,34 +35,34 @@ class Stream(BaseParser):
are comma-separated. are comma-separated.
split_text : bool, optional (default: False) split_text : bool, optional (default: False)
Split text that spans across multiple cells. Split text that spans across multiple cells.
strip_text : str, optional (default: '')
Characters that should be stripped from a string before
assigning it to a cell.
flag_size : bool, optional (default: False) flag_size : bool, optional (default: False)
Flag text based on font size. Useful to detect Flag text based on font size. Useful to detect
super/subscripts. Adds <s></s> around flagged text. super/subscripts. Adds <s></s> around flagged text.
edge_close_tol : int, optional (default: 50) strip_text : str, optional (default: '')
Characters that should be stripped from a string before
assigning it to a cell.
edge_tol : int, optional (default: 50)
Tolerance parameter for extending textedges vertically. Tolerance parameter for extending textedges vertically.
row_close_tol : int, optional (default: 2) row_tol : int, optional (default: 2)
Tolerance parameter used to combine text vertically, Tolerance parameter used to combine text vertically,
to generate rows. to generate rows.
col_close_tol : int, optional (default: 0) column_tol : int, optional (default: 0)
Tolerance parameter used to combine text horizontally, Tolerance parameter used to combine text horizontally,
to generate columns. to generate columns.
""" """
def __init__(self, table_areas=None, columns=None, split_text=False, def __init__(self, table_areas=None, columns=None, split_text=False,
flag_size=False, strip_text='', edge_close_tol=50, row_close_tol=2, flag_size=False, strip_text='', edge_tol=50, row_tol=2,
col_close_tol=0, **kwargs): column_tol=0, **kwargs):
self.table_areas = table_areas self.table_areas = table_areas
self.columns = columns self.columns = columns
self._validate_columns() self._validate_columns()
self.split_text = split_text self.split_text = split_text
self.flag_size = flag_size self.flag_size = flag_size
self.strip_text = strip_text self.strip_text = strip_text
self.edge_close_tol = edge_close_tol self.edge_tol = edge_tol
self.row_close_tol = row_close_tol self.row_tol = row_tol
self.col_close_tol = col_close_tol self.column_tol = column_tol
@staticmethod @staticmethod
def _text_bbox(t_bbox): def _text_bbox(t_bbox):
@ -88,7 +88,7 @@ class Stream(BaseParser):
return text_bbox return text_bbox
@staticmethod @staticmethod
def _group_rows(text, row_close_tol=2): def _group_rows(text, row_tol=2):
"""Groups PDFMiner text objects into rows vertically """Groups PDFMiner text objects into rows vertically
within a tolerance. within a tolerance.
@ -96,7 +96,7 @@ class Stream(BaseParser):
---------- ----------
text : list text : list
List of PDFMiner text objects. List of PDFMiner text objects.
row_close_tol : int, optional (default: 2) row_tol : int, optional (default: 2)
Returns Returns
------- -------
@ -112,7 +112,7 @@ class Stream(BaseParser):
# if t.get_text().strip() and all([obj.upright for obj in t._objs if # if t.get_text().strip() and all([obj.upright for obj in t._objs if
# type(obj) is LTChar]): # type(obj) is LTChar]):
if t.get_text().strip(): if t.get_text().strip():
if not np.isclose(row_y, t.y0, atol=row_close_tol): if not np.isclose(row_y, t.y0, atol=row_tol):
rows.append(sorted(temp, key=lambda t: t.x0)) rows.append(sorted(temp, key=lambda t: t.x0))
temp = [] temp = []
row_y = t.y0 row_y = t.y0
@ -122,7 +122,7 @@ class Stream(BaseParser):
return rows return rows
@staticmethod @staticmethod
def _merge_columns(l, col_close_tol=0): def _merge_columns(l, column_tol=0):
"""Merges column boundaries horizontally if they overlap """Merges column boundaries horizontally if they overlap
or lie within a tolerance. or lie within a tolerance.
@ -130,7 +130,7 @@ class Stream(BaseParser):
---------- ----------
l : list l : list
List of column x-coordinate tuples. List of column x-coordinate tuples.
col_close_tol : int, optional (default: 0) column_tol : int, optional (default: 0)
Returns Returns
------- -------
@ -144,17 +144,17 @@ class Stream(BaseParser):
merged.append(higher) merged.append(higher)
else: else:
lower = merged[-1] lower = merged[-1]
if col_close_tol >= 0: if column_tol >= 0:
if (higher[0] <= lower[1] or if (higher[0] <= lower[1] or
np.isclose(higher[0], lower[1], atol=col_close_tol)): np.isclose(higher[0], lower[1], atol=column_tol)):
upper_bound = max(lower[1], higher[1]) upper_bound = max(lower[1], higher[1])
lower_bound = min(lower[0], higher[0]) lower_bound = min(lower[0], higher[0])
merged[-1] = (lower_bound, upper_bound) merged[-1] = (lower_bound, upper_bound)
else: else:
merged.append(higher) merged.append(higher)
elif col_close_tol < 0: elif column_tol < 0:
if higher[0] <= lower[1]: if higher[0] <= lower[1]:
if np.isclose(higher[0], lower[1], atol=abs(col_close_tol)): if np.isclose(higher[0], lower[1], atol=abs(column_tol)):
merged.append(higher) merged.append(higher)
else: else:
upper_bound = max(lower[1], higher[1]) upper_bound = max(lower[1], higher[1])
@ -191,7 +191,7 @@ class Stream(BaseParser):
return rows return rows
@staticmethod @staticmethod
def _add_columns(cols, text, row_close_tol): def _add_columns(cols, text, row_tol):
"""Adds columns to existing list by taking into account """Adds columns to existing list by taking into account
the text that lies outside the current column x-coordinates. the text that lies outside the current column x-coordinates.
@ -210,7 +210,7 @@ class Stream(BaseParser):
""" """
if text: if text:
text = Stream._group_rows(text, row_close_tol=row_close_tol) text = Stream._group_rows(text, row_tol=row_tol)
elements = [len(r) for r in text] elements = [len(r) for r in text]
new_cols = [(t.x0, t.x1) new_cols = [(t.x0, t.x1)
for r in text if len(r) == max(elements) for t in r] for r in text if len(r) == max(elements) for t in r]
@ -259,7 +259,7 @@ class Stream(BaseParser):
# TODO: add support for arabic text #141 # TODO: add support for arabic text #141
# sort textlines in reading order # sort textlines in reading order
textlines.sort(key=lambda x: (-x.y0, x.x0)) textlines.sort(key=lambda x: (-x.y0, x.x0))
textedges = TextEdges(edge_close_tol=self.edge_close_tol) textedges = TextEdges(edge_tol=self.edge_tol)
# generate left, middle and right textedges # generate left, middle and right textedges
textedges.generate(textlines) textedges.generate(textlines)
# select relevant edges # select relevant edges
@ -301,7 +301,7 @@ class Stream(BaseParser):
self.t_bbox = t_bbox self.t_bbox = t_bbox
text_x_min, text_y_min, text_x_max, text_y_max = self._text_bbox(self.t_bbox) text_x_min, text_y_min, text_x_max, text_y_max = self._text_bbox(self.t_bbox)
rows_grouped = self._group_rows(self.t_bbox['horizontal'], row_close_tol=self.row_close_tol) rows_grouped = self._group_rows(self.t_bbox['horizontal'], row_tol=self.row_tol)
rows = self._join_rows(rows_grouped, text_y_max, text_y_min) rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
elements = [len(r) for r in rows_grouped] elements = [len(r) for r in rows_grouped]
@ -332,7 +332,7 @@ class Stream(BaseParser):
warnings.warn("No tables found in table area {}".format( warnings.warn("No tables found in table area {}".format(
table_idx + 1)) table_idx + 1))
cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r] cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r]
cols = self._merge_columns(sorted(cols), col_close_tol=self.col_close_tol) cols = self._merge_columns(sorted(cols), column_tol=self.column_tol)
inner_text = [] inner_text = []
for i in range(1, len(cols)): for i in range(1, len(cols)):
left = cols[i - 1][1] left = cols[i - 1][1]
@ -344,7 +344,7 @@ class Stream(BaseParser):
for t in self.t_bbox[direction] for t in self.t_bbox[direction]
if t.x0 > cols[-1][1] or t.x1 < cols[0][0]] if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
inner_text.extend(outer_text) inner_text.extend(outer_text)
cols = self._add_columns(cols, inner_text, self.row_close_tol) cols = self._add_columns(cols, inner_text, self.row_tol)
cols = self._join_columns(cols, text_x_min, text_x_max) cols = self._join_columns(cols, text_x_min, text_x_max)
return cols, rows return cols, rows
@ -360,7 +360,7 @@ class Stream(BaseParser):
for t in self.t_bbox[direction]: for t in self.t_bbox[direction]:
indices, error = get_table_index( indices, error = get_table_index(
table, t, direction, split_text=self.split_text, table, t, direction, split_text=self.split_text,
flag_size=self.flag_size) flag_size=self.flag_size, strip_text=self.strip_text)
if indices[:2] != (-1, -1): if indices[:2] != (-1, -1):
pos_errors.append(error) pos_errors.append(error)
for r_idx, c_idx, text in indices: for r_idx, c_idx, text in indices:

View File

@ -20,16 +20,16 @@ from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
stream_kwargs = [ stream_kwargs = [
'columns', 'columns',
'row_close_tol', 'row_tol',
'col_close_tol' 'column_tol'
] ]
lattice_kwargs = [ lattice_kwargs = [
'process_background', 'process_background',
'line_size_scaling', 'line_size_scaling',
'copy_text', 'copy_text',
'shift_text', 'shift_text',
'line_close_tol', 'line_tol',
'joint_close_tol', 'joint_tol',
'threshold_blocksize', 'threshold_blocksize',
'threshold_constant', 'threshold_constant',
'iterations' 'iterations'
@ -281,14 +281,14 @@ def text_in_bbox(bbox, text):
return t_bbox return t_bbox
def merge_close_lines(ar, line_close_tol=2): def merge_close_lines(ar, line_tol=2):
"""Merges lines which are within a tolerance by calculating a """Merges lines which are within a tolerance by calculating a
moving mean, based on their x or y axis projections. moving mean, based on their x or y axis projections.
Parameters Parameters
---------- ----------
ar : list ar : list
line_close_tol : int, optional (default: 2) line_tol : int, optional (default: 2)
Returns Returns
------- -------
@ -301,7 +301,7 @@ def merge_close_lines(ar, line_close_tol=2):
ret.append(a) ret.append(a)
else: else:
temp = ret[-1] temp = ret[-1]
if np.isclose(temp, a, atol=line_close_tol): if np.isclose(temp, a, atol=line_tol):
temp = (temp + a) / 2.0 temp = (temp + a) / 2.0
ret[-1] = temp ret[-1] = temp
else: else:

View File

@ -319,7 +319,7 @@ You can solve this by passing ``flag_size=True``, which will enclose the supersc
Control how text is grouped into rows Control how text is grouped into rows
------------------------------------- -------------------------------------
You can pass ``row_close_tol=<+int>`` to group the rows closer together, as shown below. You can pass ``row_tol=<+int>`` to group the rows closer together, as shown below.
:: ::
@ -337,7 +337,7 @@ You can pass ``row_close_tol=<+int>`` to group the rows closer together, as show
:: ::
>>> tables = camelot.read_pdf('group_rows.pdf', flavor='stream', row_close_tol=10) >>> tables = camelot.read_pdf('group_rows.pdf', flavor='stream', row_tol=10)
>>> tables[0].df >>> tables[0].df
.. tip:: .. tip::

View File

@ -312,6 +312,63 @@ data_stream_flag_size = [
["ALL STATES", "513.38", "436.02", "-", "25.57", "51.06", "14.18", "-", "8.21", "11.83", "11.08"] ["ALL STATES", "513.38", "436.02", "-", "25.57", "51.06", "14.18", "-", "8.21", "11.83", "11.08"]
] ]
# Expected rows for test_stream_strip_text: the first table read from
# "detect_vertical_false.pdf" with flavor="stream" and strip_text="\n".
# Each inner list is one table row (two columns).
data_stream_strip_text = [
    ["V i n s a u Ve r r e", ""],
    ["Les Blancs", "12.5CL"],
    ["A.O.P Côtes du Rhône", ""],
    ["Domaine de la Guicharde « Autour de la chapelle » 2016", "8 €"],
    ["A.O.P Vacqueyras", ""],
    ["Domaine de Montvac « Melodine » 2016", "10 €"],
    ["A.O.P Châteauneuf du Pape", ""],
    ["Domaine de Beaurenard 2017", "13 €"],
    ["A.O.P Côteaux du Languedoc", ""],
    ["Villa Tempora « Un temps pour elle » 2014", "9 €"],
    ["A.O.P Côtes de Provence", ""],
    ["Château Grand Boise 2017", "9 €"],
    ["Les Rosés", "12,5 CL"],
    ["A.O.P Côtes du Rhône", ""],
    ["Domaine de la Florane « A fleur de Pampre » 2016", "8 €"],
    ["Famille Coulon (Domaine Beaurenard) Biotifulfox 2017", "8 €"],
    ["A.O.P Vacqueyras", ""],
    ["Domaine de Montvac 2017", "9 €"],
    ["A.O.P Languedoc", ""],
    ["Domaine de Joncas « Nébla » 2015", "8 €"],
    ["Villa Tempora « Larroseur arrosé » 2015", "9 €"],
    ["A.O.P Côtes de Provence", ""],
    ["Château Grand Boise « Sainte Victoire » 2017", "9 €"],
    ["Château Léoube 2016", "10 €"],
]
# Expected rows for test_stream_edge_tol: the first table read from
# "edge_tolerance.pdf" with flavor="stream" and edge_tol=500.
# The "\n" escapes below are real newline characters inside the expected
# cell text (e.g. "N\number ..." is "N", newline, "umber ..."); keep them.
data_stream_edge_tol = [
    ["Key figures", ""],
    ["", "2016"],
    ["(all amounts in EUR)", ""],
    ["C\nlass A", ""],
    ["N\net Asset Value at 31 December", "5,111,372"],
    ["N\number of outstanding units at 31 December", "49,136"],
    ["N\net Asset Value per unit at 31 December", "104.03"],
    ["C\nlass B", ""],
    ["N\net Asset Value at 31 December", "49,144,825"],
    ["N\number of outstanding units at 31 December", "471,555"],
    ["N\net Asset Value per unit at 31 December", "104.22"],
    ["T\notal for the Fund", ""],
    ["N\net Asset Value at 31 December", "54,256,197"],
    ["N\number of outstanding units at 31 December", "520,691"],
    ["I\nnvestment result", ""],
    ["Direct result", "-"],
    ["Revaluation", "2,076,667"],
    ["Costs", "(106,870)"],
    ["T\notal investment result for the period1", "1,969,797"],
    ["I\nnvestment result per unit2", ""],
    ["Direct result", "-"],
    ["Revaluation", "3.99"],
    ["Costs", "(0.21)"],
    ["T\notal investment result per unit", "3.78"],
    ["1 The results cover the period from inception of the Fund at 8 April 2016 through 31 December 2016.", ""],
    ["2 The result per unit is calculated using the total number of outstanding unit as per the end of the", ""],
    ["period.", ""],
]
data_lattice = [ data_lattice = [
["Cycle \nName", "KI \n(1/km)", "Distance \n(mi)", "Percent Fuel Savings", "", "", ""], ["Cycle \nName", "KI \n(1/km)", "Distance \n(mi)", "Percent Fuel Savings", "", "", ""],
["", "", "", "Improved \nSpeed", "Decreased \nAccel", "Eliminate \nStops", "Decreased \nIdle"], ["", "", "", "Improved \nSpeed", "Decreased \nAccel", "Eliminate \nStops", "Decreased \nIdle"],
@ -485,49 +542,49 @@ data_lattice_shift_text_right_bottom = [
] ]
data_arabic = [ data_arabic = [
['ً\n\xa0\nﺎﺒﺣﺮﻣ', 'ﻥﺎﻄﻠﺳ\xa0ﻲﻤﺳﺍ'], ["ً\n\xa0\nﺎﺒﺣﺮﻣ", "ﻥﺎﻄﻠﺳ\xa0ﻲﻤﺳﺍ"],
['ﻝﺎﻤﺸﻟﺍ\xa0ﺎﻨﻴﻟﻭﺭﺎﻛ\xa0ﺔﻳﻻﻭ\xa0ﻦﻣ\xa0ﺎﻧﺍ', '؟ﺖﻧﺍ\xa0ﻦﻳﺍ\xa0ﻦﻣ'], ["ﻝﺎﻤﺸﻟﺍ\xa0ﺎﻨﻴﻟﻭﺭﺎﻛ\xa0ﺔﻳﻻﻭ\xa0ﻦﻣ\xa0ﺎﻧﺍ", "؟ﺖﻧﺍ\xa0ﻦﻳﺍ\xa0ﻦﻣ"],
['1234', 'ﻂﻄﻗ\xa047\xa0ﻱﺪﻨﻋ'], ["1234", "ﻂﻄﻗ\xa047\xa0ﻱﺪﻨﻋ"],
['؟ﻙﺎﺒﺷ\xa0ﺖﻧﺍ\xa0ﻞﻫ', 'ﺔﻳﺰﻴﻠﺠﻧﻻﺍ\xa0ﻲﻓ\xa0Jeremy\xa0ﻲﻤﺳﺍ'], ["؟ﻙﺎﺒﺷ\xa0ﺖﻧﺍ\xa0ﻞﻫ", "ﺔﻳﺰﻴﻠﺠﻧﻻﺍ\xa0ﻲﻓ\xa0Jeremy\xa0ﻲﻤﺳﺍ"],
['Jeremy\xa0is\xa0ﻲﻣﺮﺟ\xa0in\xa0Arabic', ''] ["Jeremy\xa0is\xa0ﻲﻣﺮﺟ\xa0in\xa0Arabic", ""]
] ]
data_stream_layout_kwargs = [ data_stream_layout_kwargs = [
['V i n s a u Ve r r e', ''], ["V i n s a u Ve r r e", ""],
['Les Blancs', '12.5CL'], ["Les Blancs", "12.5CL"],
['A.O.P Côtes du Rhône', ''], ["A.O.P Côtes du Rhône", ""],
['Domaine de la Guicharde « Autour de la chapelle » 2016', '8 €'], ["Domaine de la Guicharde « Autour de la chapelle » 2016", "8 €"],
['A.O.P Vacqueyras', ''], ["A.O.P Vacqueyras", ""],
['Domaine de Montvac « Melodine » 2016', '10 €'], ["Domaine de Montvac « Melodine » 2016", "10 €"],
['A.O.P Châteauneuf du Pape', ''], ["A.O.P Châteauneuf du Pape", ""],
['Domaine de Beaurenard 2017', '13 €'], ["Domaine de Beaurenard 2017", "13 €"],
['A.O.P Côteaux du Languedoc', ''], ["A.O.P Côteaux du Languedoc", ""],
['Villa Tempora « Un temps pour elle » 2014', '9 €'], ["Villa Tempora « Un temps pour elle » 2014", "9 €"],
['A.O.P Côtes de Provence', ''], ["A.O.P Côtes de Provence", ""],
['Château Grand Boise 2017', '9 €'], ["Château Grand Boise 2017", "9 €"],
['Les Rosés', '12,5 CL'], ["Les Rosés", "12,5 CL"],
['A.O.P Côtes du Rhône', ''], ["A.O.P Côtes du Rhône", ""],
['Domaine de la Florane « A fleur de Pampre » 2016', '8 €'], ["Domaine de la Florane « A fleur de Pampre » 2016", "8 €"],
['Famille Coulon (Domaine Beaurenard) Biotifulfox 2017', '8 €'], ["Famille Coulon (Domaine Beaurenard) Biotifulfox 2017", "8 €"],
['A.O.P Vacqueyras', ''], ["A.O.P Vacqueyras", ""],
['Domaine de Montvac 2017', '9 €'], ["Domaine de Montvac 2017", "9 €"],
['A.O.P Languedoc', ''], ["A.O.P Languedoc", ""],
['Domaine de Joncas « Nébla » 2015', '8 €'], ["Domaine de Joncas « Nébla » 2015", "8 €"],
['Villa Tempora « Larroseur arrosé » 2015', '9 €'], ["Villa Tempora « Larroseur arrosé » 2015", "9 €"],
['A.O.P Côtes de Provence', ''], ["A.O.P Côtes de Provence", ""],
['Château Grand Boise « Sainte Victoire » 2017', '9 €'], ["Château Grand Boise « Sainte Victoire » 2017", "9 €"],
['Château Léoube 2016', '10 €'], ["Château Léoube 2016", "10 €"],
['Les Rouges', '12,CL'], ["Les Rouges", "12,CL"],
['A.O.P Côtes du Rhône', ''], ["A.O.P Côtes du Rhône", ""],
['Domaine de Dionysos « La Cigalette »', '8 €'], ["Domaine de Dionysos « La Cigalette »", "8 €"],
['Château Saint Estève dUchaux « Grande Réserve » 2014', '9 €'], ["Château Saint Estève dUchaux « Grande Réserve » 2014", "9 €"],
['Domaine de la Guicharde « Cuvée Massillan » 2016', '9 €'], ["Domaine de la Guicharde « Cuvée Massillan » 2016", "9 €"],
['Domaine de la Florane « Terre Pourpre » 2014', '10 €'], ["Domaine de la Florane « Terre Pourpre » 2014", "10 €"],
['LOratoire St Martin « Réserve des Seigneurs » 2015', '11 €'], ["LOratoire St Martin « Réserve des Seigneurs » 2015", "11 €"],
['A.O.P Saint Joseph', ''], ["A.O.P Saint Joseph", ""],
['Domaine Monier Perréol « Châtelet » 2015', '13 €'], ["Domaine Monier Perréol « Châtelet » 2015", "13 €"],
['A.O.P Châteauneuf du Pape', ''], ["A.O.P Châteauneuf du Pape", ""],
['Domaine de Beaurenard 2011', '15 €'], ["Domaine de Beaurenard 2011", "15 €"],
['A.O.P Cornas', ''], ["A.O.P Cornas", ""],
['Domaine Lionnet « Terre Brûlée » 2012', '15 €'] ["Domaine Lionnet « Terre Brûlée » 2012", "15 €"]
] ]

Binary file not shown.

View File

@ -81,7 +81,7 @@ def test_stream_columns():
filename = os.path.join(testdir, "mexican_towns.pdf") filename = os.path.join(testdir, "mexican_towns.pdf")
tables = camelot.read_pdf( tables = camelot.read_pdf(
filename, flavor="stream", columns=["67,180,230,425,475"], row_close_tol=10) filename, flavor="stream", columns=["67,180,230,425,475"], row_tol=10)
assert df.equals(tables[0].df) assert df.equals(tables[0].df)
@ -102,6 +102,22 @@ def test_stream_flag_size():
assert df.equals(tables[0].df) assert df.equals(tables[0].df)
def test_stream_strip_text():
    """Check stream parsing when newline characters are stripped from cells."""
    pdf_path = os.path.join(testdir, "detect_vertical_false.pdf")
    expected = pd.DataFrame(data_stream_strip_text)

    parsed = camelot.read_pdf(pdf_path, flavor="stream", strip_text="\n")
    assert expected.equals(parsed[0].df)
def test_stream_edge_tol():
    """Check stream parsing of the first table when edge_tol is set to 500."""
    pdf_path = os.path.join(testdir, "edge_tolerance.pdf")
    expected = pd.DataFrame(data_stream_edge_tol)

    parsed = camelot.read_pdf(pdf_path, flavor="stream", edge_tol=500)
    assert expected.equals(parsed[0].df)
def test_stream_layout_kwargs(): def test_stream_layout_kwargs():
df = pd.DataFrame(data_stream_layout_kwargs) df = pd.DataFrame(data_stream_layout_kwargs)