From 32df09ad1c434af871a70a070a045ce1f14c2b47 Mon Sep 17 00:00:00 2001 From: Parth P Panchal Date: Wed, 24 Oct 2018 23:06:53 +0530 Subject: [PATCH] Renames the keyword `table_area` to `table_areas` (#171) `table_areas` sounds more apt since it is a list and there can be multiple table areas on a page. Closes #165 --- camelot/cli.py | 12 ++++++------ camelot/io.py | 2 +- camelot/parsers/lattice.py | 10 +++++----- camelot/parsers/stream.py | 16 ++++++++-------- docs/user/advanced.rst | 6 +++--- tests/data.py | 4 ++-- tests/test_common.py | 12 ++++++------ tests/test_errors.py | 4 ++-- 8 files changed, 33 insertions(+), 33 deletions(-) diff --git a/camelot/cli.py b/camelot/cli.py index e400204..8385450 100644 --- a/camelot/cli.py +++ b/camelot/cli.py @@ -48,7 +48,7 @@ def cli(ctx, *args, **kwargs): @cli.command('lattice') -@click.option('-T', '--table_area', default=[], multiple=True, +@click.option('-T', '--table_areas', default=[], multiple=True, help='Table areas to process. Example: x1,y1,x2,y2' ' where x1, y1 -> left-top and x2, y2 -> right-bottom.') @click.option('-back', '--process_background', is_flag=True, @@ -95,8 +95,8 @@ def lattice(c, *args, **kwargs): filepath = kwargs.pop('filepath') kwargs.update(conf) - table_area = list(kwargs['table_area']) - kwargs['table_area'] = None if not table_area else table_area + table_areas = list(kwargs['table_areas']) + kwargs['table_areas'] = None if not table_areas else table_areas copy_text = list(kwargs['copy_text']) kwargs['copy_text'] = None if not copy_text else copy_text kwargs['shift_text'] = list(kwargs['shift_text']) @@ -116,7 +116,7 @@ def lattice(c, *args, **kwargs): @cli.command('stream') -@click.option('-T', '--table_area', default=[], multiple=True, +@click.option('-T', '--table_areas', default=[], multiple=True, help='Table areas to process. Example: x1,y1,x2,y2' ' where x1, y1 -> left-top and x2, y2 -> right-bottom.') @click.option('-C', '--columns', default=[], multiple=True, @@ -142,8 +142,8 @@ def stream(c, *args, **kwargs): filepath = kwargs.pop('filepath') kwargs.update(conf) - table_area = list(kwargs['table_area']) - kwargs['table_area'] = None if not table_area else table_area + table_areas = list(kwargs['table_areas']) + kwargs['table_areas'] = None if not table_areas else table_areas columns = list(kwargs['columns']) kwargs['columns'] = None if not columns else columns diff --git a/camelot/io.py b/camelot/io.py index 5cdb542..06a31a9 100644 --- a/camelot/io.py +++ b/camelot/io.py @@ -24,7 +24,7 @@ def read_pdf(filepath, pages='1', flavor='lattice', suppress_warnings=False, Lattice is used by default. suppress_warnings : bool, optional (default: False) Prevent warnings from being emitted by Camelot. - table_area : list, optional (default: None) + table_areas : list, optional (default: None) List of table area strings of the form x1,y1,x2,y2 where (x1, y1) -> left-top and (x2, y2) -> right-bottom in PDF coordinate space. diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index c2340ec..7b7c411 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -28,7 +28,7 @@ class Lattice(BaseParser): Parameters ---------- - table_area : list, optional (default: None) + table_areas : list, optional (default: None) List of table area strings of the form x1,y1,x2,y2 where (x1, y1) -> left-top and (x2, y2) -> right-bottom in PDF coordinate space. @@ -76,12 +76,12 @@ class Lattice(BaseParser): For more information, refer `PDFMiner docs `_. """ - def __init__(self, table_area=None, process_background=False, + def __init__(self, table_areas=None, process_background=False, line_size_scaling=15, copy_text=None, shift_text=['l', 't'], split_text=False, flag_size=False, line_close_tol=2, joint_close_tol=2, threshold_blocksize=15, threshold_constant=-2, iterations=0, margins=(1.0, 0.5, 0.1), **kwargs): - self.table_area = table_area + self.table_areas = table_areas self.process_background = process_background self.line_size_scaling = line_size_scaling self.copy_text = copy_text @@ -244,9 +244,9 @@ class Lattice(BaseParser): self.threshold, direction='horizontal', line_size_scaling=self.line_size_scaling, iterations=self.iterations) - if self.table_area is not None: + if self.table_areas is not None: areas = [] - for area in self.table_area: + for area in self.table_areas: x1, y1, x2, y2 = area.split(",") x1 = float(x1) y1 = float(y1) diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 2792d82..6aee966 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -26,7 +26,7 @@ class Stream(BaseParser): Parameters ---------- - table_area : list, optional (default: None) + table_areas : list, optional (default: None) List of table area strings of the form x1,y1,x2,y2 where (x1, y1) -> left-top and (x2, y2) -> right-bottom in PDF coordinate space. @@ -50,10 +50,10 @@ class Stream(BaseParser): For more information, refer `PDFMiner docs `_. """ - def __init__(self, table_area=None, columns=None, split_text=False, + def __init__(self, table_areas=None, columns=None, split_text=False, flag_size=False, row_close_tol=2, col_close_tol=0, margins=(1.0, 0.5, 0.1), **kwargs): - self.table_area = table_area + self.table_areas = table_areas self.columns = columns self._validate_columns() self.split_text = split_text @@ -241,15 +241,15 @@ class Stream(BaseParser): return cols def _validate_columns(self): - if self.table_area is not None and self.columns is not None: - if len(self.table_area) != len(self.columns): - raise ValueError("Length of table_area and columns" + if self.table_areas is not None and self.columns is not None: + if len(self.table_areas) != len(self.columns): + raise ValueError("Length of table_areas and columns" " should be equal") def _generate_table_bbox(self): - if self.table_area is not None: + if self.table_areas is not None: table_bbox = {} - for area in self.table_area: + for area in self.table_areas: x1, y1, x2, y2 = area.split(",") x1 = float(x1) y1 = float(y1) diff --git a/docs/user/advanced.rst b/docs/user/advanced.rst index b012c6d..e697949 100644 --- a/docs/user/advanced.rst +++ b/docs/user/advanced.rst @@ -151,13 +151,13 @@ Specify table areas Since :ref:`Stream ` treats the whole page as a table, `for now`_, it's useful to specify table boundaries in cases such as `these <../_static/pdf/table_areas.pdf>`__. You can :ref:`plot the text ` on this page and note the top left and bottom right coordinates of the table. -Table areas that you want Camelot to analyze can be passed as a list of comma-separated strings to :meth:`read_pdf() `, using the ``table_area`` keyword argument. +Table areas that you want Camelot to analyze can be passed as a list of comma-separated strings to :meth:`read_pdf() `, using the ``table_areas`` keyword argument. .. _for now: https://github.com/socialcopsdev/camelot/issues/102 :: - >>> tables = camelot.read_pdf('table_areas.pdf', flavor='stream', table_area=['316,499,566,337']) + >>> tables = camelot.read_pdf('table_areas.pdf', flavor='stream', table_areas=['316,499,566,337']) >>> tables[0].df .. csv-table:: @@ -172,7 +172,7 @@ You can pass the column separators as a list of comma-separated strings to :meth In case you passed a single column separators string list, and no table area is specified, the separators will be applied to the whole page. When a list of table areas is specified and you need to specify column separators as well, **the length of both lists should be equal**. Each table area will be mapped to each column separators' string using their indices. -For example, if you have specified two table areas, ``table_area=['12,23,43,54', '20,33,55,67']``, and only want to specify column separators for the first table, you can pass an empty string for the second table in the column separators' list like this, ``columns=['10,120,200,400', '']``. +For example, if you have specified two table areas, ``table_areas=['12,23,43,54', '20,33,55,67']``, and only want to specify column separators for the first table, you can pass an empty string for the second table in the column separators' list like this, ``columns=['10,120,200,400', '']``. Let's get back to the *x* coordinates we got from :ref:`plotting text ` that exists on this `PDF <../_static/pdf/column_separators.pdf>`__, and get the table out! diff --git a/tests/data.py b/tests/data.py index 3a04d24..00e070a 100755 --- a/tests/data.py +++ b/tests/data.py @@ -81,7 +81,7 @@ data_stream_table_rotated = [ ["", "", "", "", "", "", "", "", "54", "", "", "", "", "", "", "", "", ""] ] -data_stream_table_area = [ +data_stream_table_areas = [ ["", "One Withholding"], ["Payroll Period", "Allowance"], ["Weekly", "$71.15"], @@ -261,7 +261,7 @@ data_lattice_table_rotated = [ ["Pooled", "38742", "53618", "60601", "86898", "4459", "21918", "27041", "14312", "18519"] ] -data_lattice_table_area = [ +data_lattice_table_areas = [ ["", "", "", "", "", "", "", "", ""], ["State", "n", "Literacy Status", "", "", "", "", "", ""], ["", "", "Illiterate", "Read & Write", "1-4 std.", "5-8 std.", "9-12 std.", "College", ""], diff --git a/tests/test_common.py b/tests/test_common.py index 2e7c24e..1f599fd 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -45,11 +45,11 @@ def test_stream_table_rotated(): assert df.equals(tables[0].df) -def test_stream_table_area(): - df = pd.DataFrame(data_stream_table_area) +def test_stream_table_areas(): + df = pd.DataFrame(data_stream_table_areas) filename = os.path.join(testdir, "tabula/us-007.pdf") - tables = camelot.read_pdf(filename, flavor="stream", table_area=["320,500,573,335"]) + tables = camelot.read_pdf(filename, flavor="stream", table_areas=["320,500,573,335"]) assert df.equals(tables[0].df) @@ -100,11 +100,11 @@ def test_lattice_table_rotated(): assert df.equals(tables[0].df) -def test_lattice_table_area(): - df = pd.DataFrame(data_lattice_table_area) +def test_lattice_table_areas(): + df = pd.DataFrame(data_lattice_table_areas) filename = os.path.join(testdir, "twotables_2.pdf") - tables = camelot.read_pdf(filename, table_area=["80,693,535,448"]) + tables = camelot.read_pdf(filename, table_areas=["80,693,535,448"]) assert df.equals(tables[0].df) diff --git a/tests/test_errors.py b/tests/test_errors.py index 095e5d5..89db31d 100755 --- a/tests/test_errors.py +++ b/tests/test_errors.py @@ -34,11 +34,11 @@ def test_unsupported_format(): def test_stream_equal_length(): - message = ("Length of table_area and columns" + message = ("Length of table_areas and columns" " should be equal") with pytest.raises(ValueError, message=message): tables = camelot.read_pdf(filename, flavor='stream', - table_area=['10,20,30,40'], columns=['10,20,30,40', '10,20,30,40']) + table_areas=['10,20,30,40'], columns=['10,20,30,40', '10,20,30,40']) def test_no_tables_found():