Fix merge conflict
@@ -4,6 +4,12 @@ Release History
 master
 ------
 
+**Improvements**
+
+* [#170](https://github.com/socialcopsdev/camelot/issues/170) Add option to pass pdfminer layout kwargs. [#232](https://github.com/socialcopsdev/camelot/pull/232) by Vinayak Mehta.
+    * Keyword arguments for [pdfminer.layout.LAParams](https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33) can now be passed using `layout_kwargs` in `read_pdf()`.
+    * The `margins` keyword argument in `read_pdf()` is now deprecated.
+
 0.5.0 (2018-12-13)
 ------------------
 
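
A minimal usage sketch of the new option described above (``foo.pdf`` is a placeholder file name; ``detect_vertical`` is just one of the ``LAParams`` keys that can be passed)::

    >>> import camelot
    >>> # keys in layout_kwargs are forwarded to pdfminer.layout.LAParams
    >>> tables = camelot.read_pdf('foo.pdf', layout_kwargs={'detect_vertical': False})
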
@@ -43,6 +43,8 @@ pass_config = click.make_pass_decorator(Config)
               help='Split text that spans across multiple cells.')
 @click.option('-flag', '--flag_size', is_flag=True, help='Flag text based on'
               ' font size. Useful to detect super/subscripts.')
+@click.option('-strip', '--strip_text', help='Characters that should be stripped from a string before'
+              ' assigning it to a cell.')
 @click.option('-M', '--margins', nargs=3, default=(1.0, 0.5, 0.1),
               help='PDFMiner char_margin, line_margin and word_margin.')
 @click.pass_context
@@ -68,10 +70,10 @@ def cli(ctx, *args, **kwargs):
 @click.option('-shift', '--shift_text', default=['l', 't'],
               type=click.Choice(['', 'l', 'r', 't', 'b']), multiple=True,
               help='Direction in which text in a spanning cell will flow.')
-@click.option('-l', '--line_close_tol', default=2,
+@click.option('-l', '--line_tol', default=2,
               help='Tolerance parameter used to merge close vertical'
               ' and horizontal lines.')
-@click.option('-j', '--joint_close_tol', default=2,
+@click.option('-j', '--joint_tol', default=2,
               help='Tolerance parameter used to decide whether'
               ' the detected lines and points lie close to each other.')
 @click.option('-block', '--threshold_blocksize', default=15,
@@ -84,6 +86,8 @@ def cli(ctx, *args, **kwargs):
               ' may be zero or negative as well.')
 @click.option('-I', '--iterations', default=0,
               help='Number of times for erosion/dilation will be applied.')
+@click.option('-res', '--resolution', default=300,
+              help='Resolution used for PDF to PNG conversion.')
 @click.option('-plot', '--plot_type',
               type=click.Choice(['text', 'grid', 'contour', 'joint', 'line']),
               help='Plot elements found on PDF page for visual debugging.')
@@ -133,9 +137,11 @@ def lattice(c, *args, **kwargs):
               ' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
 @click.option('-C', '--columns', default=[], multiple=True,
               help='X coordinates of column separators.')
-@click.option('-r', '--row_close_tol', default=2, help='Tolerance parameter'
+@click.option('-e', '--edge_tol', default=50, help='Tolerance parameter'
+              ' for extending textedges vertically.')
+@click.option('-r', '--row_tol', default=2, help='Tolerance parameter'
               ' used to combine text vertically, to generate rows.')
-@click.option('-c', '--col_close_tol', default=0, help='Tolerance parameter'
+@click.option('-c', '--column_tol', default=0, help='Tolerance parameter'
               ' used to combine text horizontally, to generate columns.')
 @click.option('-plot', '--plot_type',
               type=click.Choice(['text', 'grid', 'contour', 'textedge']),
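
For reference, the renamed and new flags as they read on the command line, taken from the docs updated further down in this diff::

    $ camelot -strip ' .\n' stream 12s0324.pdf
    $ camelot stream -e 500 -plot contour edge.pdf
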
@@ -13,8 +13,6 @@ import pandas as pd
 # minimum number of vertical textline intersections for a textedge
 # to be considered valid
 TEXTEDGE_REQUIRED_ELEMENTS = 4
-# y coordinate tolerance for extending textedge
-TEXTEDGE_EXTEND_TOLERANCE = 50
 # padding added to table area on the left, right and bottom
 TABLE_AREA_PADDING = 10
 
@@ -55,11 +53,11 @@ class TextEdge(object):
         return '<TextEdge x={} y0={} y1={} align={} valid={}>'.format(
             round(self.x, 2), round(self.y0, 2), round(self.y1, 2), self.align, self.is_valid)
 
-    def update_coords(self, x, y0):
+    def update_coords(self, x, y0, edge_tol=50):
         """Updates the text edge's x and bottom y coordinates and sets
         the is_valid attribute.
         """
-        if np.isclose(self.y0, y0, atol=TEXTEDGE_EXTEND_TOLERANCE):
+        if np.isclose(self.y0, y0, atol=edge_tol):
             self.x = (self.intersections * self.x + x) / float(self.intersections + 1)
             self.y0 = y0
             self.intersections += 1
@@ -74,7 +72,8 @@ class TextEdges(object):
     the PDF page. The dict has three keys based on the alignments,
     and each key's value is a list of camelot.core.TextEdge objects.
     """
-    def __init__(self):
+    def __init__(self, edge_tol=50):
+        self.edge_tol = edge_tol
         self._textedges = {'left': [], 'right': [], 'middle': []}
 
     @staticmethod
@@ -115,7 +114,8 @@ class TextEdges(object):
         if idx is None:
             self.add(textline, align)
         else:
-            self._textedges[align][idx].update_coords(x_coord, textline.y0)
+            self._textedges[align][idx].update_coords(
+                x_coord, textline.y0, edge_tol=self.edge_tol)
 
     def generate(self, textlines):
         """Generates the text edges dict based on horizontal text
@@ -359,7 +359,7 @@ class Table(object):
                 cell.left = cell.right = cell.top = cell.bottom = True
         return self
 
-    def set_edges(self, vertical, horizontal, joint_close_tol=2):
+    def set_edges(self, vertical, horizontal, joint_tol=2):
         """Sets a cell's edges to True depending on whether the cell's
         coordinates overlap with the line's coordinates within a
         tolerance.
@@ -376,11 +376,11 @@ class Table(object):
             # find closest x coord
             # iterate over y coords and find closest start and end points
             i = [i for i, t in enumerate(self.cols)
-                 if np.isclose(v[0], t[0], atol=joint_close_tol)]
+                 if np.isclose(v[0], t[0], atol=joint_tol)]
             j = [j for j, t in enumerate(self.rows)
-                 if np.isclose(v[3], t[0], atol=joint_close_tol)]
+                 if np.isclose(v[3], t[0], atol=joint_tol)]
             k = [k for k, t in enumerate(self.rows)
-                 if np.isclose(v[1], t[0], atol=joint_close_tol)]
+                 if np.isclose(v[1], t[0], atol=joint_tol)]
             if not j:
                 continue
             J = j[0]
@@ -427,11 +427,11 @@ class Table(object):
             # find closest y coord
             # iterate over x coords and find closest start and end points
             i = [i for i, t in enumerate(self.rows)
-                 if np.isclose(h[1], t[0], atol=joint_close_tol)]
+                 if np.isclose(h[1], t[0], atol=joint_tol)]
             j = [j for j, t in enumerate(self.cols)
-                 if np.isclose(h[0], t[0], atol=joint_close_tol)]
+                 if np.isclose(h[0], t[0], atol=joint_tol)]
             k = [k for k, t in enumerate(self.cols)
-                 if np.isclose(h[2], t[0], atol=joint_close_tol)]
+                 if np.isclose(h[2], t[0], atol=joint_tol)]
             if not j:
                 continue
             J = j[0]
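
An illustrative sketch of the comparison that ``edge_tol`` now controls in ``TextEdge.update_coords`` (the y values are made up)::

    >>> import numpy as np
    >>> # a textline extends an existing textedge only if its y0 lies within edge_tol of the edge's y0
    >>> bool(np.isclose(745.0, 792.0, atol=50))   # default edge_tol=50
    True
    >>> bool(np.isclose(745.0, 792.0, atol=20))   # a tighter tolerance starts a new edge instead
    False
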
@@ -125,7 +125,7 @@ class PDFHandler(object):
             with open(fpath, 'wb') as f:
                 outfile.write(f)
 
-    def parse(self, flavor='lattice', suppress_stdout=False, **kwargs):
+    def parse(self, flavor='lattice', suppress_stdout=False, layout_kwargs={}, **kwargs):
         """Extracts tables by calling parser.get_tables on all single
         page PDFs.
 
@@ -136,6 +136,8 @@ class PDFHandler(object):
             Lattice is used by default.
         suppress_stdout : str (default: False)
             Suppress logs and warnings.
+        layout_kwargs : dict, optional (default: {})
+            A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
         kwargs : dict
             See camelot.read_pdf kwargs.
@@ -153,6 +155,7 @@ class PDFHandler(object):
                  for p in self.pages]
         parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs)
         for p in pages:
-            t = parser.extract_tables(p, suppress_stdout=suppress_stdout)
+            t = parser.extract_tables(p, suppress_stdout=suppress_stdout,
+                                      layout_kwargs=layout_kwargs)
             tables.extend(t)
         return TableList(tables)
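
A sketch of how the new keyword flows through the internal API (``foo.pdf`` is a placeholder, and the import path is an assumption; most users would call ``camelot.read_pdf``, which forwards ``layout_kwargs`` here)::

    >>> from camelot.handlers import PDFHandler
    >>> handler = PDFHandler('foo.pdf', pages='1')
    >>> tables = handler.parse(flavor='stream', layout_kwargs={'detect_vertical': False})
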
@@ -6,7 +6,7 @@ from .utils import validate_input, remove_extra
 
 
 def read_pdf(filepath, pages='1', password=None, flavor='lattice',
-             suppress_stdout=False, **kwargs):
+             suppress_stdout=False, layout_kwargs={}, **kwargs):
     """Read PDF and return extracted tables.
 
     Note: kwargs annotated with ^ can only be used with flavor='stream'
@@ -26,6 +26,8 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
         Lattice is used by default.
     suppress_stdout : bool, optional (default: True)
         Print all logs and warnings.
+    layout_kwargs : dict, optional (default: {})
+        A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
     table_areas : list, optional (default: None)
         List of table area strings of the form x1,y1,x2,y2
         where (x1, y1) -> left-top and (x2, y2) -> right-bottom
@@ -38,10 +40,13 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
     flag_size : bool, optional (default: False)
         Flag text based on font size. Useful to detect
         super/subscripts. Adds <s></s> around flagged text.
-    row_close_tol^ : int, optional (default: 2)
+    strip_text : str, optional (default: '')
+        Characters that should be stripped from a string before
+        assigning it to a cell.
+    row_tol^ : int, optional (default: 2)
         Tolerance parameter used to combine text vertically,
         to generate rows.
-    col_close_tol^ : int, optional (default: 0)
+    column_tol^ : int, optional (default: 0)
         Tolerance parameter used to combine text horizontally,
         to generate columns.
     process_background* : bool, optional (default: False)
@@ -57,10 +62,10 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
     shift_text* : list, optional (default: ['l', 't'])
         {'l', 'r', 't', 'b'}
         Direction in which text in a spanning cell will flow.
-    line_close_tol* : int, optional (default: 2)
+    line_tol* : int, optional (default: 2)
         Tolerance parameter used to merge close vertical and horizontal
         lines.
-    joint_close_tol* : int, optional (default: 2)
+    joint_tol* : int, optional (default: 2)
         Tolerance parameter used to decide whether the detected lines
         and points lie close to each other.
     threshold_blocksize* : int, optional (default: 15)
@@ -77,10 +82,8 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
         Number of times for erosion/dilation is applied.
 
         For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
-    margins : tuple
-        PDFMiner char_margin, line_margin and word_margin.
-
-        For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
+    resolution* : int, optional (default: 300)
+        Resolution used for PDF to PNG conversion.
 
     Returns
     -------
@@ -98,5 +101,6 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
     validate_input(kwargs, flavor=flavor)
     p = PDFHandler(filepath, pages=pages, password=password)
     kwargs = remove_extra(kwargs, flavor=flavor)
-    tables = p.parse(flavor=flavor, suppress_stdout=suppress_stdout, **kwargs)
+    tables = p.parse(flavor=flavor, suppress_stdout=suppress_stdout,
+                     layout_kwargs=layout_kwargs, **kwargs)
     return tables
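
Putting the renamed ``read_pdf`` kwargs together in one hedged sketch (file names and values are illustrative, drawn from the docs and tests elsewhere in this diff; ``strip_text`` works with both flavors)::

    >>> import camelot
    >>> # stream kwargs: edge_tol, row_tol, column_tol
    >>> tables = camelot.read_pdf('edge_tol.pdf', flavor='stream', edge_tol=500, strip_text=' .\n')
    >>> # lattice kwargs: line_tol, joint_tol, resolution
    >>> tables = camelot.read_pdf('foo.pdf', flavor='lattice', line_tol=2, joint_tol=2, resolution=300)
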
@@ -8,13 +8,11 @@ from ..utils import get_page_layout, get_text_objects
 class BaseParser(object):
     """Defines a base parser.
     """
-    def _generate_layout(self, filename):
+    def _generate_layout(self, filename, layout_kwargs):
         self.filename = filename
+        self.layout_kwargs = layout_kwargs
         self.layout, self.dimensions = get_page_layout(
-            self.filename,
-            char_margin=self.char_margin,
-            line_margin=self.line_margin,
-            word_margin=self.word_margin)
+            filename, **layout_kwargs)
         self.horizontal_text = get_text_objects(self.layout, ltype="lh")
         self.vertical_text = get_text_objects(self.layout, ltype="lv")
         self.pdf_width, self.pdf_height = self.dimensions
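
The kwargs forwarded by ``_generate_layout`` end up on pdfminer's ``LAParams``; a quick sketch of the accepted names, which are the same settings the old ``margins`` tuple used to carry::

    >>> from pdfminer.layout import LAParams
    >>> laparams = LAParams(char_margin=1.0, line_margin=0.5, word_margin=0.1, detect_vertical=False)
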
@@ -53,10 +53,13 @@ class Lattice(BaseParser):
     flag_size : bool, optional (default: False)
         Flag text based on font size. Useful to detect
         super/subscripts. Adds <s></s> around flagged text.
-    line_close_tol : int, optional (default: 2)
+    strip_text : str, optional (default: '')
+        Characters that should be stripped from a string before
+        assigning it to a cell.
+    line_tol : int, optional (default: 2)
         Tolerance parameter used to merge close vertical and horizontal
         lines.
-    joint_close_tol : int, optional (default: 2)
+    joint_tol : int, optional (default: 2)
         Tolerance parameter used to decide whether the detected lines
         and points lie close to each other.
     threshold_blocksize : int, optional (default: 15)
@@ -73,17 +76,15 @@ class Lattice(BaseParser):
         Number of times for erosion/dilation is applied.
 
         For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
-    margins : tuple
-        PDFMiner char_margin, line_margin and word_margin.
-
-        For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
+    resolution : int, optional (default: 300)
+        Resolution used for PDF to PNG conversion.
 
     """
     def __init__(self, table_areas=None, process_background=False,
                  line_size_scaling=15, copy_text=None, shift_text=['l', 't'],
-                 split_text=False, flag_size=False, line_close_tol=2,
-                 joint_close_tol=2, threshold_blocksize=15, threshold_constant=-2,
-                 iterations=0, margins=(1.0, 0.5, 0.1), **kwargs):
+                 split_text=False, flag_size=False, strip_text='', line_tol=2,
+                 joint_tol=2, threshold_blocksize=15, threshold_constant=-2,
+                 iterations=0, resolution=300, **kwargs):
         self.table_areas = table_areas
         self.process_background = process_background
         self.line_size_scaling = line_size_scaling
@@ -91,12 +92,13 @@ class Lattice(BaseParser):
         self.shift_text = shift_text
         self.split_text = split_text
         self.flag_size = flag_size
-        self.line_close_tol = line_close_tol
-        self.joint_close_tol = joint_close_tol
+        self.strip_text = strip_text
+        self.line_tol = line_tol
+        self.joint_tol = joint_tol
         self.threshold_blocksize = threshold_blocksize
         self.threshold_constant = threshold_constant
         self.iterations = iterations
-        self.char_margin, self.line_margin, self.word_margin = margins
+        self.resolution = resolution
 
     @staticmethod
     def _reduce_index(t, idx, shift_text):
@@ -245,9 +247,9 @@ class Lattice(BaseParser):
             rows.extend([tk[1], tk[3]])
         # sort horizontal and vertical segments
         cols = merge_close_lines(
-            sorted(cols), line_close_tol=self.line_close_tol)
+            sorted(cols), line_tol=self.line_tol)
         rows = merge_close_lines(
-            sorted(rows, reverse=True), line_close_tol=self.line_close_tol)
+            sorted(rows, reverse=True), line_tol=self.line_tol)
         # make grid using x and y coord of shortlisted rows and cols
         cols = [(cols[i], cols[i + 1])
                 for i in range(0, len(cols) - 1)]
@@ -264,7 +266,7 @@ class Lattice(BaseParser):
 
         table = Table(cols, rows)
         # set table edges to True using ver+hor lines
-        table = table.set_edges(v_s, h_s, joint_close_tol=self.joint_close_tol)
+        table = table.set_edges(v_s, h_s, joint_tol=self.joint_tol)
         # set table border edges to True
         table = table.set_border()
         # set spanning cells to True
@@ -277,7 +279,7 @@ class Lattice(BaseParser):
             for t in self.t_bbox[direction]:
                 indices, error = get_table_index(
                     table, t, direction, split_text=self.split_text,
-                    flag_size=self.flag_size)
+                    flag_size=self.flag_size, strip_text=self.strip_text)
                 if indices[:2] != (-1, -1):
                     pos_errors.append(error)
                     indices = Lattice._reduce_index(table, indices, shift_text=self.shift_text)
@@ -310,8 +312,8 @@ class Lattice(BaseParser):
 
         return table
 
-    def extract_tables(self, filename, suppress_stdout=False):
-        self._generate_layout(filename)
+    def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
+        self._generate_layout(filename, layout_kwargs)
         if not suppress_stdout:
             logger.info('Processing {}'.format(os.path.basename(self.rootname)))
 
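
A hedged sketch of the renamed ``Lattice`` parameters on the parser itself (assuming ``camelot.parsers`` exposes ``Lattice`` and that ``page-1.pdf`` is a local single-page PDF; normally ``read_pdf`` builds the parser for you)::

    >>> from camelot.parsers import Lattice
    >>> parser = Lattice(line_tol=2, joint_tol=2, strip_text='\n', resolution=300)
    >>> tables = parser.extract_tables('page-1.pdf', layout_kwargs={'detect_vertical': False})
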
@@ -38,29 +38,31 @@ class Stream(BaseParser):
     flag_size : bool, optional (default: False)
         Flag text based on font size. Useful to detect
         super/subscripts. Adds <s></s> around flagged text.
-    row_close_tol : int, optional (default: 2)
+    strip_text : str, optional (default: '')
+        Characters that should be stripped from a string before
+        assigning it to a cell.
+    edge_tol : int, optional (default: 50)
+        Tolerance parameter for extending textedges vertically.
+    row_tol : int, optional (default: 2)
         Tolerance parameter used to combine text vertically,
         to generate rows.
-    col_close_tol : int, optional (default: 0)
+    column_tol : int, optional (default: 0)
         Tolerance parameter used to combine text horizontally,
         to generate columns.
-    margins : tuple, optional (default: (1.0, 0.5, 0.1))
-        PDFMiner char_margin, line_margin and word_margin.
-
-        For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
 
     """
     def __init__(self, table_areas=None, columns=None, split_text=False,
-                 flag_size=False, row_close_tol=2, col_close_tol=0,
-                 margins=(1.0, 0.5, 0.1), **kwargs):
+                 flag_size=False, strip_text='', edge_tol=50, row_tol=2,
+                 column_tol=0, **kwargs):
         self.table_areas = table_areas
         self.columns = columns
         self._validate_columns()
         self.split_text = split_text
         self.flag_size = flag_size
-        self.row_close_tol = row_close_tol
-        self.col_close_tol = col_close_tol
-        self.char_margin, self.line_margin, self.word_margin = margins
+        self.strip_text = strip_text
+        self.edge_tol = edge_tol
+        self.row_tol = row_tol
+        self.column_tol = column_tol
 
     @staticmethod
     def _text_bbox(t_bbox):
@@ -86,7 +88,7 @@ class Stream(BaseParser):
         return text_bbox
 
     @staticmethod
-    def _group_rows(text, row_close_tol=2):
+    def _group_rows(text, row_tol=2):
         """Groups PDFMiner text objects into rows vertically
         within a tolerance.
 
@@ -94,7 +96,7 @@ class Stream(BaseParser):
         ----------
         text : list
             List of PDFMiner text objects.
-        row_close_tol : int, optional (default: 2)
+        row_tol : int, optional (default: 2)
 
         Returns
         -------
@@ -110,7 +112,7 @@ class Stream(BaseParser):
             # if t.get_text().strip() and all([obj.upright for obj in t._objs if
             # type(obj) is LTChar]):
             if t.get_text().strip():
-                if not np.isclose(row_y, t.y0, atol=row_close_tol):
+                if not np.isclose(row_y, t.y0, atol=row_tol):
                     rows.append(sorted(temp, key=lambda t: t.x0))
                     temp = []
                     row_y = t.y0
@@ -120,7 +122,7 @@ class Stream(BaseParser):
         return rows
 
     @staticmethod
-    def _merge_columns(l, col_close_tol=0):
+    def _merge_columns(l, column_tol=0):
         """Merges column boundaries horizontally if they overlap
         or lie within a tolerance.
 
@@ -128,7 +130,7 @@ class Stream(BaseParser):
         ----------
         l : list
            List of column x-coordinate tuples.
-        col_close_tol : int, optional (default: 0)
+        column_tol : int, optional (default: 0)
 
        Returns
        -------
@@ -142,17 +144,17 @@ class Stream(BaseParser):
                 merged.append(higher)
             else:
                 lower = merged[-1]
-                if col_close_tol >= 0:
+                if column_tol >= 0:
                     if (higher[0] <= lower[1] or
-                            np.isclose(higher[0], lower[1], atol=col_close_tol)):
+                            np.isclose(higher[0], lower[1], atol=column_tol)):
                         upper_bound = max(lower[1], higher[1])
                         lower_bound = min(lower[0], higher[0])
                         merged[-1] = (lower_bound, upper_bound)
                     else:
                         merged.append(higher)
-                elif col_close_tol < 0:
+                elif column_tol < 0:
                     if higher[0] <= lower[1]:
-                        if np.isclose(higher[0], lower[1], atol=abs(col_close_tol)):
+                        if np.isclose(higher[0], lower[1], atol=abs(column_tol)):
                             merged.append(higher)
                         else:
                             upper_bound = max(lower[1], higher[1])
@@ -189,7 +191,7 @@ class Stream(BaseParser):
         return rows
 
     @staticmethod
-    def _add_columns(cols, text, row_close_tol):
+    def _add_columns(cols, text, row_tol):
         """Adds columns to existing list by taking into account
         the text that lies outside the current column x-coordinates.
 
@@ -208,7 +210,7 @@ class Stream(BaseParser):
 
         """
         if text:
-            text = Stream._group_rows(text, row_close_tol=row_close_tol)
+            text = Stream._group_rows(text, row_tol=row_tol)
             elements = [len(r) for r in text]
             new_cols = [(t.x0, t.x1)
                         for r in text if len(r) == max(elements) for t in r]
@@ -254,11 +256,10 @@ class Stream(BaseParser):
         Assumes that tables are situated relatively far apart
         vertically.
         """
-
         # TODO: add support for arabic text #141
         # sort textlines in reading order
         textlines.sort(key=lambda x: (-x.y0, x.x0))
-        textedges = TextEdges()
+        textedges = TextEdges(edge_tol=self.edge_tol)
         # generate left, middle and right textedges
         textedges.generate(textlines)
         # select relevant edges
@@ -300,7 +301,7 @@ class Stream(BaseParser):
         self.t_bbox = t_bbox
 
         text_x_min, text_y_min, text_x_max, text_y_max = self._text_bbox(self.t_bbox)
-        rows_grouped = self._group_rows(self.t_bbox['horizontal'], row_close_tol=self.row_close_tol)
+        rows_grouped = self._group_rows(self.t_bbox['horizontal'], row_tol=self.row_tol)
        rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
        elements = [len(r) for r in rows_grouped]
 
@@ -331,7 +332,7 @@ class Stream(BaseParser):
             warnings.warn("No tables found in table area {}".format(
                 table_idx + 1))
         cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r]
-        cols = self._merge_columns(sorted(cols), col_close_tol=self.col_close_tol)
+        cols = self._merge_columns(sorted(cols), column_tol=self.column_tol)
         inner_text = []
         for i in range(1, len(cols)):
             left = cols[i - 1][1]
@@ -343,7 +344,7 @@ class Stream(BaseParser):
                       for t in self.t_bbox[direction]
                       if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
         inner_text.extend(outer_text)
-        cols = self._add_columns(cols, inner_text, self.row_close_tol)
+        cols = self._add_columns(cols, inner_text, self.row_tol)
         cols = self._join_columns(cols, text_x_min, text_x_max)
 
         return cols, rows
@@ -359,7 +360,7 @@ class Stream(BaseParser):
             for t in self.t_bbox[direction]:
                 indices, error = get_table_index(
                     table, t, direction, split_text=self.split_text,
-                    flag_size=self.flag_size)
+                    flag_size=self.flag_size, strip_text=self.strip_text)
                 if indices[:2] != (-1, -1):
                     pos_errors.append(error)
                     for r_idx, c_idx, text in indices:
@@ -388,8 +389,8 @@ class Stream(BaseParser):
 
         return table
 
-    def extract_tables(self, filename, suppress_stdout=False):
-        self._generate_layout(filename)
+    def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
+        self._generate_layout(filename, layout_kwargs)
         if not suppress_stdout:
             logger.info('Processing {}'.format(os.path.basename(self.rootname)))
 
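
An illustrative sketch of the grouping that ``row_tol`` controls in ``_group_rows`` (made-up y0 values)::

    >>> import numpy as np
    >>> y_coords = [707.9, 706.3, 684.1]      # textline y0 values in reading order
    >>> row_y = y_coords[0]
    >>> # with row_tol=2 the first two textlines share a row, the third starts a new one
    >>> [bool(np.isclose(row_y, y, atol=2)) for y in y_coords]
    [True, True, False]
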
@@ -20,16 +20,16 @@ from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
 
 stream_kwargs = [
     'columns',
-    'row_close_tol',
-    'col_close_tol'
+    'row_tol',
+    'column_tol'
 ]
 lattice_kwargs = [
     'process_background',
     'line_size_scaling',
     'copy_text',
     'shift_text',
-    'line_close_tol',
-    'joint_close_tol',
+    'line_tol',
+    'joint_tol',
     'threshold_blocksize',
     'threshold_constant',
     'iterations'
@@ -281,14 +281,14 @@ def text_in_bbox(bbox, text):
     return t_bbox
 
 
-def merge_close_lines(ar, line_close_tol=2):
+def merge_close_lines(ar, line_tol=2):
     """Merges lines which are within a tolerance by calculating a
     moving mean, based on their x or y axis projections.
 
     Parameters
     ----------
     ar : list
-    line_close_tol : int, optional (default: 2)
+    line_tol : int, optional (default: 2)
 
     Returns
     -------
@@ -301,7 +301,7 @@ def merge_close_lines(ar, line_close_tol=2):
             ret.append(a)
         else:
             temp = ret[-1]
-            if np.isclose(temp, a, atol=line_close_tol):
+            if np.isclose(temp, a, atol=line_tol):
                 temp = (temp + a) / 2.0
                 ret[-1] = temp
             else:
@@ -309,7 +309,12 @@ def merge_close_lines(ar, line_close_tol=2):
     return ret
 
 
-def flag_font_size(textline, direction):
+# TODO: combine the following functions into a TextProcessor class which
+# applies corresponding transformations sequentially
+# (inspired from sklearn.pipeline.Pipeline)
+
+
+def flag_font_size(textline, direction, strip_text=''):
     """Flags super/subscripts in text by enclosing them with <s></s>.
     May give false positives.
 
@@ -319,6 +324,9 @@ def flag_font_size(textline, direction):
         List of PDFMiner LTChar objects.
     direction : string
         Direction of the PDFMiner LTTextLine object.
+    strip_text : str, optional (default: '')
+        Characters that should be stripped from a string before
+        assigning it to a cell.
 
     Returns
     -------
@@ -344,13 +352,13 @@ def flag_font_size(textline, direction):
             fchars = [t[0] for t in chars]
             if ''.join(fchars).strip():
                 flist.append(''.join(fchars))
-        fstring = ''.join(flist)
+        fstring = ''.join(flist).strip(strip_text)
     else:
-        fstring = ''.join([t.get_text() for t in textline])
+        fstring = ''.join([t.get_text() for t in textline]).strip(strip_text)
     return fstring
 
 
-def split_textline(table, textline, direction, flag_size=False):
+def split_textline(table, textline, direction, flag_size=False, strip_text=''):
     """Splits PDFMiner LTTextLine into substrings if it spans across
     multiple rows/columns.
 
@@ -365,6 +373,9 @@ def split_textline(table, textline, direction, flag_size=False):
         Whether or not to highlight a substring using <s></s>
         if its size is different from rest of the string. (Useful for
         super and subscripts.)
+    strip_text : str, optional (default: '')
+        Characters that should be stripped from a string before
+        assigning it to a cell.
 
     Returns
     -------
@@ -416,14 +427,15 @@ def split_textline(table, textline, direction, flag_size=False):
     grouped_chars = []
     for key, chars in groupby(cut_text, itemgetter(0, 1)):
         if flag_size:
-            grouped_chars.append((key[0], key[1], flag_font_size([t[2] for t in chars], direction)))
+            grouped_chars.append((key[0], key[1],
+                flag_font_size([t[2] for t in chars], direction, strip_text=strip_text)))
         else:
             gchars = [t[2].get_text() for t in chars]
-            grouped_chars.append((key[0], key[1], ''.join(gchars)))
+            grouped_chars.append((key[0], key[1], ''.join(gchars).strip(strip_text)))
     return grouped_chars
 
 
-def get_table_index(table, t, direction, split_text=False, flag_size=False):
+def get_table_index(table, t, direction, split_text=False, flag_size=False, strip_text='',):
     """Gets indices of the table cell where given text object lies by
     comparing their y and x-coordinates.
 
@@ -441,6 +453,9 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False):
         Whether or not to highlight a substring using <s></s>
         if its size is different from rest of the string. (Useful for
         super and subscripts)
+    strip_text : str, optional (default: '')
+        Characters that should be stripped from a string before
+        assigning it to a cell.
 
     Returns
     -------
@@ -495,12 +510,12 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False):
     error = ((X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))) / charea
 
     if split_text:
-        return split_textline(table, t, direction, flag_size=flag_size), error
+        return split_textline(table, t, direction, flag_size=flag_size, strip_text=strip_text), error
     else:
         if flag_size:
-            return [(r_idx, c_idx, flag_font_size(t._objs, direction))], error
+            return [(r_idx, c_idx, flag_font_size(t._objs, direction, strip_text=strip_text))], error
         else:
-            return [(r_idx, c_idx, t.get_text())], error
+            return [(r_idx, c_idx, t.get_text().strip(strip_text))], error
 
 
 def compute_accuracy(error_weights):
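
A quick sketch of what ``strip_text`` does to a cell string: it is handed to ``str.strip``, so only leading and trailing characters from the given set are removed::

    >>> "1,396 .4\n".strip(' .\n')
    '1,396 .4'
    >>> "Forcible rape  ".strip(' .\n')
    'Forcible rape'
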
@@ -316,10 +316,87 @@ You can solve this by passing ``flag_size=True``, which will enclose the supersc
     "Madhya Pradesh","27.13","23.57","-","-","3.56","0.38","-","1.86","-","1.28"
     "...","...","...","...","...","...","...","...","...","...","..."
 
-Control how text is grouped into rows
--------------------------------------
+Strip characters from text
+--------------------------
 
-You can pass ``row_close_tol=<+int>`` to group the rows closer together, as shown below.
+You can strip unwanted characters like spaces, dots and newlines from a string using the ``strip_text`` keyword argument. Take a look at `this PDF <https://github.com/socialcopsdev/camelot/blob/master/tests/files/tabula/12s0324.pdf>`_ as an example; the text at the start of each row contains a lot of unwanted spaces, dots and newlines.
+
+::
+
+    >>> tables = camelot.read_pdf('12s0324.pdf', flavor='stream', strip_text=' .\n')
+    >>> tables[0].df
+
+.. tip::
+    Here's how you can do the same with the :ref:`command-line interface <cli>`.
+    ::
+
+        $ camelot -strip ' .\n' stream 12s0324.pdf
+
+.. csv-table::
+
+    "...","...","...","...","...","...","...","...","...","..."
+    "Forcible rape","17.5","2.6","14.9","17.2","2.5","14.7","–","–","–"
+    "Robbery","102.1","25.5","76.6","90.0","22.9","67.1","12.1","2.5","9.5"
+    "Aggravated assault","338.4","40.1","298.3","264.0","30.2","233.8","74.4","9.9","64.5"
+    "Property crime","1,396 .4","338 .7","1,057 .7","875 .9","210 .8","665 .1","608 .2","127 .9","392 .6"
+    "Burglary","240.9","60.3","180.6","205.0","53.4","151.7","35.9","6.9","29.0"
+    "...","...","...","...","...","...","...","...","...","..."
+
+Improve guessed table areas
+---------------------------
+
+While using :ref:`Stream <stream>`, automatic table detection can fail for PDFs like `this one <https://github.com/socialcopsdev/camelot/blob/master/tests/files/edge_tol.pdf>`_. That's because the text is relatively far apart vertically, which can lead to shorter textedges being calculated.
+
+.. note:: To know more about how textedges are calculated to guess table areas, you can see pages 20, 35 and 40 of `Anssi Nurminen's master's thesis <http://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3>`_.
+
+Let's see the table area that is detected by default.
+
+::
+
+    >>> tables = camelot.read_pdf('edge_tol.pdf', flavor='stream')
+    >>> camelot.plot(tables[0], kind='contour')
+    >>> plt.show()
+
+.. tip::
+    Here's how you can do the same with the :ref:`command-line interface <cli>`.
+    ::
+
+        $ camelot stream -plot contour edge.pdf
+
+.. figure:: ../_static/png/edge_tol_1.png
+    :height: 674
+    :width: 1366
+    :scale: 50%
+    :alt: Table area with default edge_tol
+    :align: left
+
+To improve the detected area, you can increase the ``edge_tol`` (default: 50) value to counter the effect of text being placed relatively far apart vertically. A larger ``edge_tol`` will lead to longer textedges being detected, leading to an improved guess of the table area. Let's use a value of 500.
+
+::
+
+    >>> tables = camelot.read_pdf('edge_tol.pdf', flavor='stream', edge_tol=500)
+    >>> camelot.plot(tables[0], kind='contour')
+    >>> plt.show()
+
+.. tip::
+    Here's how you can do the same with the :ref:`command-line interface <cli>`.
+    ::
+
+        $ camelot stream -e 500 -plot contour edge.pdf
+
+.. figure:: ../_static/png/edge_tol_2.png
+    :height: 674
+    :width: 1366
+    :scale: 50%
+    :alt: Table area with edge_tol=500
+    :align: left
+
+As you can see, the guessed table area has improved!
+
+Improve guessed table rows
+--------------------------
+
+You can pass ``row_tol=<+int>`` to group the rows closer together, as shown below.
 
 ::
 
@@ -337,7 +414,7 @@ You can pass ``row_close_tol=<+int>`` to group the rows closer together, as show
 
 ::
 
-    >>> tables = camelot.read_pdf('group_rows.pdf', flavor='stream', row_close_tol=10)
+    >>> tables = camelot.read_pdf('group_rows.pdf', flavor='stream', row_tol=10)
     >>> tables[0].df
 
 .. tip::
@@ -524,3 +601,14 @@ We don't need anything else. Now, let's pass ``copy_text=['v']`` to copy text in
     "4","West Bengal","West Medinipur","iv. Acute Diarrhoeal Disease","145","0","04/01/14","05/01/14","Under control","..."
     "4","West Bengal","Birbhum","v. Food Poisoning","199","0","31/12/13","31/12/13","Under control","..."
     "4","West Bengal","Howrah","vi. Viral Hepatitis A &E","85","0","26/12/13","27/12/13","Under surveillance","..."
+
+Tweak layout generation
+-----------------------
+
+Camelot is built on top of PDFMiner's functionality of grouping characters on a page into words and sentences. In some cases (such as `#170 <https://github.com/socialcopsdev/camelot/issues/170>`_ and `#215 <https://github.com/socialcopsdev/camelot/issues/215>`_), PDFMiner can group characters that should belong to the same sentence into separate sentences.
+
+To deal with such cases, you can tweak PDFMiner's `LAParams kwargs <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ to improve layout generation, by passing the keyword arguments as a dict using ``layout_kwargs`` in :meth:`read_pdf() <camelot.read_pdf>`. To know more about the parameters you can tweak, you can check out `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
+
+::
+
+    >>> tables = camelot.read_pdf('foo.pdf', layout_kwargs={'detect_vertical': False})
tests/data.py
@@ -312,6 +312,63 @@ data_stream_flag_size = [
     ["ALL STATES", "513.38", "436.02", "-", "25.57", "51.06", "14.18", "-", "8.21", "11.83", "11.08"]
 ]
 
+data_stream_strip_text = [
+    ["V i n s a u Ve r r e", ""],
+    ["Les Blancs", "12.5CL"],
+    ["A.O.P Côtes du Rhône", ""],
+    ["Domaine de la Guicharde « Autour de la chapelle » 2016", "8 €"],
+    ["A.O.P Vacqueyras", ""],
+    ["Domaine de Montvac « Melodine » 2016", "10 €"],
+    ["A.O.P Châteauneuf du Pape", ""],
+    ["Domaine de Beaurenard 2017", "13 €"],
+    ["A.O.P Côteaux du Languedoc", ""],
+    ["Villa Tempora « Un temps pour elle » 2014", "9 €"],
+    ["A.O.P Côtes de Provence", ""],
+    ["Château Grand Boise 2017", "9 €"],
+    ["Les Rosés", "12,5 CL"],
+    ["A.O.P Côtes du Rhône", ""],
+    ["Domaine de la Florane « A fleur de Pampre » 2016", "8 €"],
+    ["Famille Coulon (Domaine Beaurenard) Biotifulfox 2017", "8 €"],
+    ["A.O.P Vacqueyras", ""],
+    ["Domaine de Montvac 2017", "9 €"],
+    ["A.O.P Languedoc", ""],
+    ["Domaine de Joncas « Nébla » 2015", "8 €"],
+    ["Villa Tempora « L’arroseur arrosé » 2015", "9 €"],
+    ["A.O.P Côtes de Provence", ""],
+    ["Château Grand Boise « Sainte Victoire » 2017", "9 €"],
+    ["Château Léoube 2016", "10 €"]
+]
+
+data_stream_edge_tol = [
+    ["Key figures", ""],
+    ["", "2016"],
+    ["(all amounts in EUR)", ""],
+    ["C\nlass A", ""],
+    ["N\net Asset Value at 31 December", "5,111,372"],
+    ["N\number of outstanding units at 31 December", "49,136"],
+    ["N\net Asset Value per unit at 31 December", "104.03"],
+    ["C\nlass B", ""],
+    ["N\net Asset Value at 31 December", "49,144,825"],
+    ["N\number of outstanding units at 31 December", "471,555"],
+    ["N\net Asset Value per unit at 31 December", "104.22"],
+    ["T\notal for the Fund", ""],
+    ["N\net Asset Value at 31 December", "54,256,197"],
+    ["N\number of outstanding units at 31 December", "520,691"],
+    ["I\nnvestment result", ""],
+    ["Direct result", "-"],
+    ["Revaluation", "2,076,667"],
+    ["Costs", "(106,870)"],
+    ["T\notal investment result for the period1", "1,969,797"],
+    ["I\nnvestment result per unit2", ""],
+    ["Direct result", "-"],
+    ["Revaluation", "3.99"],
+    ["Costs", "(0.21)"],
+    ["T\notal investment result per unit", "3.78"],
+    ["1 The results cover the period from inception of the Fund at 8 April 2016 through 31 December 2016.", ""],
+    ["2 The result per unit is calculated using the total number of outstanding unit as per the end of the", ""],
+    ["period.", ""]
+]
+
 data_lattice = [
     ["Cycle \nName", "KI \n(1/km)", "Distance \n(mi)", "Percent Fuel Savings", "", "", ""],
     ["", "", "", "Improved \nSpeed", "Decreased \nAccel", "Eliminate \nStops", "Decreased \nIdle"],
@@ -485,9 +542,49 @@ data_lattice_shift_text_right_bottom = [
 ]
 
 data_arabic = [
-    ['ً\n\xa0\nﺎﺒﺣﺮﻣ', 'ﻥﺎﻄﻠﺳ\xa0ﻲﻤﺳﺍ'],
-    ['ﻝﺎﻤﺸﻟﺍ\xa0ﺎﻨﻴﻟﻭﺭﺎﻛ\xa0ﺔﻳﻻﻭ\xa0ﻦﻣ\xa0ﺎﻧﺍ', '؟ﺖﻧﺍ\xa0ﻦﻳﺍ\xa0ﻦﻣ'],
-    ['1234', 'ﻂﻄﻗ\xa047\xa0ﻱﺪﻨﻋ'],
-    ['؟ﻙﺎﺒﺷ\xa0ﺖﻧﺍ\xa0ﻞﻫ', 'ﺔﻳﺰﻴﻠﺠﻧﻻﺍ\xa0ﻲﻓ\xa0Jeremy\xa0ﻲﻤﺳﺍ'],
-    ['Jeremy\xa0is\xa0ﻲﻣﺮﺟ\xa0in\xa0Arabic', '']
+    ["ً\n\xa0\nﺎﺒﺣﺮﻣ", "ﻥﺎﻄﻠﺳ\xa0ﻲﻤﺳﺍ"],
+    ["ﻝﺎﻤﺸﻟﺍ\xa0ﺎﻨﻴﻟﻭﺭﺎﻛ\xa0ﺔﻳﻻﻭ\xa0ﻦﻣ\xa0ﺎﻧﺍ", "؟ﺖﻧﺍ\xa0ﻦﻳﺍ\xa0ﻦﻣ"],
+    ["1234", "ﻂﻄﻗ\xa047\xa0ﻱﺪﻨﻋ"],
+    ["؟ﻙﺎﺒﺷ\xa0ﺖﻧﺍ\xa0ﻞﻫ", "ﺔﻳﺰﻴﻠﺠﻧﻻﺍ\xa0ﻲﻓ\xa0Jeremy\xa0ﻲﻤﺳﺍ"],
+    ["Jeremy\xa0is\xa0ﻲﻣﺮﺟ\xa0in\xa0Arabic", ""]
+]
+
+data_stream_layout_kwargs = [
+    ["V i n s a u Ve r r e", ""],
+    ["Les Blancs", "12.5CL"],
+    ["A.O.P Côtes du Rhône", ""],
+    ["Domaine de la Guicharde « Autour de la chapelle » 2016", "8 €"],
+    ["A.O.P Vacqueyras", ""],
+    ["Domaine de Montvac « Melodine » 2016", "10 €"],
+    ["A.O.P Châteauneuf du Pape", ""],
+    ["Domaine de Beaurenard 2017", "13 €"],
+    ["A.O.P Côteaux du Languedoc", ""],
+    ["Villa Tempora « Un temps pour elle » 2014", "9 €"],
+    ["A.O.P Côtes de Provence", ""],
+    ["Château Grand Boise 2017", "9 €"],
+    ["Les Rosés", "12,5 CL"],
+    ["A.O.P Côtes du Rhône", ""],
+    ["Domaine de la Florane « A fleur de Pampre » 2016", "8 €"],
+    ["Famille Coulon (Domaine Beaurenard) Biotifulfox 2017", "8 €"],
+    ["A.O.P Vacqueyras", ""],
+    ["Domaine de Montvac 2017", "9 €"],
+    ["A.O.P Languedoc", ""],
+    ["Domaine de Joncas « Nébla » 2015", "8 €"],
+    ["Villa Tempora « L’arroseur arrosé » 2015", "9 €"],
+    ["A.O.P Côtes de Provence", ""],
+    ["Château Grand Boise « Sainte Victoire » 2017", "9 €"],
+    ["Château Léoube 2016", "10 €"],
+    ["Les Rouges", "12,CL"],
+    ["A.O.P Côtes du Rhône", ""],
+    ["Domaine de Dionysos « La Cigalette »", "8 €"],
+    ["Château Saint Estève d’Uchaux « Grande Réserve » 2014", "9 €"],
+    ["Domaine de la Guicharde « Cuvée Massillan » 2016", "9 €"],
+    ["Domaine de la Florane « Terre Pourpre » 2014", "10 €"],
+    ["L’Oratoire St Martin « Réserve des Seigneurs » 2015", "11 €"],
+    ["A.O.P Saint Joseph", ""],
+    ["Domaine Monier Perréol « Châtelet » 2015", "13 €"],
+    ["A.O.P Châteauneuf du Pape", ""],
+    ["Domaine de Beaurenard 2011", "15 €"],
+    ["A.O.P Cornas", ""],
+    ["Domaine Lionnet « Terre Brûlée » 2012", "15 €"]
 ]
@@ -81,7 +81,7 @@ def test_stream_columns():
 
     filename = os.path.join(testdir, "mexican_towns.pdf")
     tables = camelot.read_pdf(
-        filename, flavor="stream", columns=["67,180,230,425,475"], row_close_tol=10)
+        filename, flavor="stream", columns=["67,180,230,425,475"], row_tol=10)
     assert df.equals(tables[0].df)
 
 
@@ -102,6 +102,31 @@ def test_stream_flag_size():
     assert df.equals(tables[0].df)
 
 
+def test_stream_strip_text():
+    df = pd.DataFrame(data_stream_strip_text)
+
+    filename = os.path.join(testdir, "detect_vertical_false.pdf")
+    tables = camelot.read_pdf(filename, flavor="stream", strip_text="\n")
+    assert df.equals(tables[0].df)
+
+
+def test_stream_edge_tol():
+    df = pd.DataFrame(data_stream_edge_tol)
+
+    filename = os.path.join(testdir, "edge_tol.pdf")
+    tables = camelot.read_pdf(filename, flavor="stream", edge_tol=500)
+    assert df.equals(tables[0].df)
+
+
+def test_stream_layout_kwargs():
+    df = pd.DataFrame(data_stream_layout_kwargs)
+
+    filename = os.path.join(testdir, "detect_vertical_false.pdf")
+    tables = camelot.read_pdf(
+        filename, flavor="stream", layout_kwargs={"detect_vertical": False})
+    assert df.equals(tables[0].df)
+
+
 def test_lattice():
     df = pd.DataFrame(data_lattice)
 
@@ -179,7 +204,7 @@ def test_repr():
     tables = camelot.read_pdf(filename)
     assert repr(tables) == "<TableList n=1>"
     assert repr(tables[0]) == "<Table shape=(7, 7)>"
-    assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.42 x2=164.64 y2=233.89>"
+    assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
 
 
 def test_arabic():