Renames the keyword `table_area` to `table_areas` (#171)
`table_areas` is a more apt name because the argument is a list and a page can contain multiple table areas. Closes #165
pull/2/head
parent
8205e0e9ab
commit
32df09ad1c
|
|
@ -48,7 +48,7 @@ def cli(ctx, *args, **kwargs):
|
||||||
|
|
||||||
|
|
||||||
@cli.command('lattice')
|
@cli.command('lattice')
|
||||||
@click.option('-T', '--table_area', default=[], multiple=True,
|
@click.option('-T', '--table_areas', default=[], multiple=True,
|
||||||
help='Table areas to process. Example: x1,y1,x2,y2'
|
help='Table areas to process. Example: x1,y1,x2,y2'
|
||||||
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
|
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
|
||||||
@click.option('-back', '--process_background', is_flag=True,
|
@click.option('-back', '--process_background', is_flag=True,
|
||||||
|
|
@ -95,8 +95,8 @@ def lattice(c, *args, **kwargs):
|
||||||
filepath = kwargs.pop('filepath')
|
filepath = kwargs.pop('filepath')
|
||||||
kwargs.update(conf)
|
kwargs.update(conf)
|
||||||
|
|
||||||
table_area = list(kwargs['table_area'])
|
table_areas = list(kwargs['table_areas'])
|
||||||
kwargs['table_area'] = None if not table_area else table_area
|
kwargs['table_areas'] = None if not table_areas else table_areas
|
||||||
copy_text = list(kwargs['copy_text'])
|
copy_text = list(kwargs['copy_text'])
|
||||||
kwargs['copy_text'] = None if not copy_text else copy_text
|
kwargs['copy_text'] = None if not copy_text else copy_text
|
||||||
kwargs['shift_text'] = list(kwargs['shift_text'])
|
kwargs['shift_text'] = list(kwargs['shift_text'])
|
||||||
|
|
@ -116,7 +116,7 @@ def lattice(c, *args, **kwargs):
|
||||||
|
|
||||||
|
|
||||||
@cli.command('stream')
|
@cli.command('stream')
|
||||||
@click.option('-T', '--table_area', default=[], multiple=True,
|
@click.option('-T', '--table_areas', default=[], multiple=True,
|
||||||
help='Table areas to process. Example: x1,y1,x2,y2'
|
help='Table areas to process. Example: x1,y1,x2,y2'
|
||||||
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
|
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
|
||||||
@click.option('-C', '--columns', default=[], multiple=True,
|
@click.option('-C', '--columns', default=[], multiple=True,
|
||||||
|
|
@ -142,8 +142,8 @@ def stream(c, *args, **kwargs):
|
||||||
filepath = kwargs.pop('filepath')
|
filepath = kwargs.pop('filepath')
|
||||||
kwargs.update(conf)
|
kwargs.update(conf)
|
||||||
|
|
||||||
table_area = list(kwargs['table_area'])
|
table_areas = list(kwargs['table_areas'])
|
||||||
kwargs['table_area'] = None if not table_area else table_area
|
kwargs['table_areas'] = None if not table_areas else table_areas
|
||||||
columns = list(kwargs['columns'])
|
columns = list(kwargs['columns'])
|
||||||
kwargs['columns'] = None if not columns else columns
|
kwargs['columns'] = None if not columns else columns
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -24,7 +24,7 @@ def read_pdf(filepath, pages='1', flavor='lattice', suppress_warnings=False,
|
||||||
Lattice is used by default.
|
Lattice is used by default.
|
||||||
suppress_warnings : bool, optional (default: False)
|
suppress_warnings : bool, optional (default: False)
|
||||||
Prevent warnings from being emitted by Camelot.
|
Prevent warnings from being emitted by Camelot.
|
||||||
table_area : list, optional (default: None)
|
table_areas : list, optional (default: None)
|
||||||
List of table area strings of the form x1,y1,x2,y2
|
List of table area strings of the form x1,y1,x2,y2
|
||||||
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
|
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
|
||||||
in PDF coordinate space.
|
in PDF coordinate space.
|
||||||
|
|
|
||||||
|
|
@ -28,7 +28,7 @@ class Lattice(BaseParser):
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
table_area : list, optional (default: None)
|
table_areas : list, optional (default: None)
|
||||||
List of table area strings of the form x1,y1,x2,y2
|
List of table area strings of the form x1,y1,x2,y2
|
||||||
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
|
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
|
||||||
in PDF coordinate space.
|
in PDF coordinate space.
|
||||||
|
|
@ -76,12 +76,12 @@ class Lattice(BaseParser):
|
||||||
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, table_area=None, process_background=False,
|
def __init__(self, table_areas=None, process_background=False,
|
||||||
line_size_scaling=15, copy_text=None, shift_text=['l', 't'],
|
line_size_scaling=15, copy_text=None, shift_text=['l', 't'],
|
||||||
split_text=False, flag_size=False, line_close_tol=2,
|
split_text=False, flag_size=False, line_close_tol=2,
|
||||||
joint_close_tol=2, threshold_blocksize=15, threshold_constant=-2,
|
joint_close_tol=2, threshold_blocksize=15, threshold_constant=-2,
|
||||||
iterations=0, margins=(1.0, 0.5, 0.1), **kwargs):
|
iterations=0, margins=(1.0, 0.5, 0.1), **kwargs):
|
||||||
self.table_area = table_area
|
self.table_areas = table_areas
|
||||||
self.process_background = process_background
|
self.process_background = process_background
|
||||||
self.line_size_scaling = line_size_scaling
|
self.line_size_scaling = line_size_scaling
|
||||||
self.copy_text = copy_text
|
self.copy_text = copy_text
|
||||||
|
|
@ -244,9 +244,9 @@ class Lattice(BaseParser):
|
||||||
self.threshold, direction='horizontal',
|
self.threshold, direction='horizontal',
|
||||||
line_size_scaling=self.line_size_scaling, iterations=self.iterations)
|
line_size_scaling=self.line_size_scaling, iterations=self.iterations)
|
||||||
|
|
||||||
if self.table_area is not None:
|
if self.table_areas is not None:
|
||||||
areas = []
|
areas = []
|
||||||
for area in self.table_area:
|
for area in self.table_areas:
|
||||||
x1, y1, x2, y2 = area.split(",")
|
x1, y1, x2, y2 = area.split(",")
|
||||||
x1 = float(x1)
|
x1 = float(x1)
|
||||||
y1 = float(y1)
|
y1 = float(y1)
|
||||||
|
|
|
||||||
|
|
@ -26,7 +26,7 @@ class Stream(BaseParser):
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
table_area : list, optional (default: None)
|
table_areas : list, optional (default: None)
|
||||||
List of table area strings of the form x1,y1,x2,y2
|
List of table area strings of the form x1,y1,x2,y2
|
||||||
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
|
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
|
||||||
in PDF coordinate space.
|
in PDF coordinate space.
|
||||||
|
|
@ -50,10 +50,10 @@ class Stream(BaseParser):
|
||||||
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, table_area=None, columns=None, split_text=False,
|
def __init__(self, table_areas=None, columns=None, split_text=False,
|
||||||
flag_size=False, row_close_tol=2, col_close_tol=0,
|
flag_size=False, row_close_tol=2, col_close_tol=0,
|
||||||
margins=(1.0, 0.5, 0.1), **kwargs):
|
margins=(1.0, 0.5, 0.1), **kwargs):
|
||||||
self.table_area = table_area
|
self.table_areas = table_areas
|
||||||
self.columns = columns
|
self.columns = columns
|
||||||
self._validate_columns()
|
self._validate_columns()
|
||||||
self.split_text = split_text
|
self.split_text = split_text
|
||||||
|
|
@ -241,15 +241,15 @@ class Stream(BaseParser):
|
||||||
return cols
|
return cols
|
||||||
|
|
||||||
def _validate_columns(self):
|
def _validate_columns(self):
|
||||||
if self.table_area is not None and self.columns is not None:
|
if self.table_areas is not None and self.columns is not None:
|
||||||
if len(self.table_area) != len(self.columns):
|
if len(self.table_areas) != len(self.columns):
|
||||||
raise ValueError("Length of table_area and columns"
|
raise ValueError("Length of table_areas and columns"
|
||||||
" should be equal")
|
" should be equal")
|
||||||
|
|
||||||
def _generate_table_bbox(self):
|
def _generate_table_bbox(self):
|
||||||
if self.table_area is not None:
|
if self.table_areas is not None:
|
||||||
table_bbox = {}
|
table_bbox = {}
|
||||||
for area in self.table_area:
|
for area in self.table_areas:
|
||||||
x1, y1, x2, y2 = area.split(",")
|
x1, y1, x2, y2 = area.split(",")
|
||||||
x1 = float(x1)
|
x1 = float(x1)
|
||||||
y1 = float(y1)
|
y1 = float(y1)
|
||||||
|
|
|
||||||
|
|
@ -151,13 +151,13 @@ Specify table areas
|
||||||
|
|
||||||
Since :ref:`Stream <stream>` treats the whole page as a table, `for now`_, it's useful to specify table boundaries in cases such as `these <../_static/pdf/table_areas.pdf>`__. You can :ref:`plot the text <geometry_text>` on this page and note the top left and bottom right coordinates of the table.
|
Since :ref:`Stream <stream>` treats the whole page as a table, `for now`_, it's useful to specify table boundaries in cases such as `these <../_static/pdf/table_areas.pdf>`__. You can :ref:`plot the text <geometry_text>` on this page and note the top left and bottom right coordinates of the table.
|
||||||
|
|
||||||
Table areas that you want Camelot to analyze can be passed as a list of comma-separated strings to :meth:`read_pdf() <camelot.read_pdf>`, using the ``table_area`` keyword argument.
|
Table areas that you want Camelot to analyze can be passed as a list of comma-separated strings to :meth:`read_pdf() <camelot.read_pdf>`, using the ``table_areas`` keyword argument.
|
||||||
|
|
||||||
.. _for now: https://github.com/socialcopsdev/camelot/issues/102
|
.. _for now: https://github.com/socialcopsdev/camelot/issues/102
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
>>> tables = camelot.read_pdf('table_areas.pdf', flavor='stream', table_area=['316,499,566,337'])
|
>>> tables = camelot.read_pdf('table_areas.pdf', flavor='stream', table_areas=['316,499,566,337'])
|
||||||
>>> tables[0].df
|
>>> tables[0].df
|
||||||
|
|
||||||
.. csv-table::
|
.. csv-table::
|
||||||
|
|
@ -172,7 +172,7 @@ You can pass the column separators as a list of comma-separated strings to :meth
|
||||||
|
|
||||||
If you pass a list containing a single column separators string and no table area is specified, the separators will be applied to the whole page. When a list of table areas is specified and you need to specify column separators as well, **the length of both lists should be equal**. Each table area is mapped to the column separators string at the same index.
|
If you pass a list containing a single column separators string and no table area is specified, the separators will be applied to the whole page. When a list of table areas is specified and you need to specify column separators as well, **the length of both lists should be equal**. Each table area is mapped to the column separators string at the same index.
|
||||||
|
|
||||||
For example, if you have specified two table areas, ``table_area=['12,23,43,54', '20,33,55,67']``, and only want to specify column separators for the first table, you can pass an empty string for the second table in the column separators' list like this, ``columns=['10,120,200,400', '']``.
|
For example, if you have specified two table areas, ``table_areas=['12,23,43,54', '20,33,55,67']``, and only want to specify column separators for the first table, you can pass an empty string for the second table in the column separators' list like this, ``columns=['10,120,200,400', '']``.
|
||||||
|
|
||||||
Let's get back to the *x* coordinates we got from :ref:`plotting text <geometry_text>` that exists on this `PDF <../_static/pdf/column_separators.pdf>`__, and get the table out!
|
Let's get back to the *x* coordinates we got from :ref:`plotting text <geometry_text>` that exists on this `PDF <../_static/pdf/column_separators.pdf>`__, and get the table out!
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -81,7 +81,7 @@ data_stream_table_rotated = [
|
||||||
["", "", "", "", "", "", "", "", "54", "", "", "", "", "", "", "", "", ""]
|
["", "", "", "", "", "", "", "", "54", "", "", "", "", "", "", "", "", ""]
|
||||||
]
|
]
|
||||||
|
|
||||||
data_stream_table_area = [
|
data_stream_table_areas = [
|
||||||
["", "One Withholding"],
|
["", "One Withholding"],
|
||||||
["Payroll Period", "Allowance"],
|
["Payroll Period", "Allowance"],
|
||||||
["Weekly", "$71.15"],
|
["Weekly", "$71.15"],
|
||||||
|
|
@ -261,7 +261,7 @@ data_lattice_table_rotated = [
|
||||||
["Pooled", "38742", "53618", "60601", "86898", "4459", "21918", "27041", "14312", "18519"]
|
["Pooled", "38742", "53618", "60601", "86898", "4459", "21918", "27041", "14312", "18519"]
|
||||||
]
|
]
|
||||||
|
|
||||||
data_lattice_table_area = [
|
data_lattice_table_areas = [
|
||||||
["", "", "", "", "", "", "", "", ""],
|
["", "", "", "", "", "", "", "", ""],
|
||||||
["State", "n", "Literacy Status", "", "", "", "", "", ""],
|
["State", "n", "Literacy Status", "", "", "", "", "", ""],
|
||||||
["", "", "Illiterate", "Read & Write", "1-4 std.", "5-8 std.", "9-12 std.", "College", ""],
|
["", "", "Illiterate", "Read & Write", "1-4 std.", "5-8 std.", "9-12 std.", "College", ""],
|
||||||
|
|
|
||||||
|
|
@ -45,11 +45,11 @@ def test_stream_table_rotated():
|
||||||
assert df.equals(tables[0].df)
|
assert df.equals(tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
def test_stream_table_area():
|
def test_stream_table_areas():
|
||||||
df = pd.DataFrame(data_stream_table_area)
|
df = pd.DataFrame(data_stream_table_areas)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "tabula/us-007.pdf")
|
filename = os.path.join(testdir, "tabula/us-007.pdf")
|
||||||
tables = camelot.read_pdf(filename, flavor="stream", table_area=["320,500,573,335"])
|
tables = camelot.read_pdf(filename, flavor="stream", table_areas=["320,500,573,335"])
|
||||||
assert df.equals(tables[0].df)
|
assert df.equals(tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -100,11 +100,11 @@ def test_lattice_table_rotated():
|
||||||
assert df.equals(tables[0].df)
|
assert df.equals(tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
def test_lattice_table_area():
|
def test_lattice_table_areas():
|
||||||
df = pd.DataFrame(data_lattice_table_area)
|
df = pd.DataFrame(data_lattice_table_areas)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "twotables_2.pdf")
|
filename = os.path.join(testdir, "twotables_2.pdf")
|
||||||
tables = camelot.read_pdf(filename, table_area=["80,693,535,448"])
|
tables = camelot.read_pdf(filename, table_areas=["80,693,535,448"])
|
||||||
assert df.equals(tables[0].df)
|
assert df.equals(tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -34,11 +34,11 @@ def test_unsupported_format():
|
||||||
|
|
||||||
|
|
||||||
def test_stream_equal_length():
|
def test_stream_equal_length():
|
||||||
message = ("Length of table_area and columns"
|
message = ("Length of table_areas and columns"
|
||||||
" should be equal")
|
" should be equal")
|
||||||
with pytest.raises(ValueError, message=message):
|
with pytest.raises(ValueError, message=message):
|
||||||
tables = camelot.read_pdf(filename, flavor='stream',
|
tables = camelot.read_pdf(filename, flavor='stream',
|
||||||
table_area=['10,20,30,40'], columns=['10,20,30,40', '10,20,30,40'])
|
table_areas=['10,20,30,40'], columns=['10,20,30,40', '10,20,30,40'])
|
||||||
|
|
||||||
|
|
||||||
def test_no_tables_found():
|
def test_no_tables_found():
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue