Merge pull request #243 from socialcopsdev/add-table-regions

[MRG] Add table regions support
2019-01-04 22:00:11 +05:30 · 2019-01-04 22:00:11 +05:30 · 7cf409aa08
parent a5027e81c5 302a506e1c
commit 7cf409aa08
13 changed files with 162 additions and 60 deletions
--- a/HISTORY.md
+++ b/HISTORY.md
@ -4,6 +4,13 @@ Release History
 master
 ------

+**Improvements**
+
+* [#209](https://github.com/socialcopsdev/camelot/issues/209) Add support to analyze only certain page regions to look for tables. [#243](https://github.com/socialcopsdev/camelot/pull/243) by Vinayak Mehta.
+    * You can use `table_regions` in `read_pdf()` to specify approximate page regions which may contain tables.
+    * Kwarg `line_size_scaling` is now called `line_scale`.
+* [#239](https://github.com/socialcopsdev/camelot/issues/239) Raise warning if PDF is image-based. [#240](https://github.com/socialcopsdev/camelot/pull/240) by Vinayak Mehta.
+
 0.6.0 (2018-12-24)
 ------------------

--- a/camelot/cli.py
+++ b/camelot/cli.py
@ -56,12 +56,15 @@ def cli(ctx, *args, **kwargs):


@cli.command('lattice')
+@click.option('-R', '--table_regions', default=[], multiple=True,
+              help='Page regions to analyze. Example: x1,y1,x2,y2'
+              ' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@click.option('-T', '--table_areas', default=[], multiple=True,
              help='Table areas to process. Example: x1,y1,x2,y2'
              ' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@click.option('-back', '--process_background', is_flag=True,
              help='Process background lines.')
-@click.option('-scale', '--line_size_scaling', default=15,
+@click.option('-scale', '--line_scale', default=15,
              help='Line size scaling factor. The larger the value,'
              ' the smaller the detected lines.')
@click.option('-copy', '--copy_text', default=[], type=click.Choice(['h', 'v']),
@ -105,6 +108,8 @@ def lattice(c, *args, **kwargs):
    filepath = kwargs.pop('filepath')
    kwargs.update(conf)

+    table_regions = list(kwargs['table_regions'])
+    kwargs['table_regions'] = None if not table_regions else table_regions
    table_areas = list(kwargs['table_areas'])
    kwargs['table_areas'] = None if not table_areas else table_areas
    copy_text = list(kwargs['copy_text'])
@ -132,6 +137,9 @@ def lattice(c, *args, **kwargs):


@cli.command('stream')
+@click.option('-R', '--table_regions', default=[], multiple=True,
+              help='Page regions to analyze. Example: x1,y1,x2,y2'
+              ' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@click.option('-T', '--table_areas', default=[], multiple=True,
              help='Table areas to process. Example: x1,y1,x2,y2'
              ' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@ -160,6 +168,8 @@ def stream(c, *args, **kwargs):
    filepath = kwargs.pop('filepath')
    kwargs.update(conf)

+    table_regions = list(kwargs['table_regions'])
+    kwargs['table_regions'] = None if not table_regions else table_regions
    table_areas = list(kwargs['table_areas'])
    kwargs['table_areas'] = None if not table_areas else table_areas
    columns = list(kwargs['columns'])
--- a/camelot/image_processing.py
+++ b/camelot/image_processing.py
@ -48,7 +48,8 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
    return img, threshold


-def find_lines(threshold, direction='horizontal', line_size_scaling=15, iterations=0):
+def find_lines(threshold, regions=None, direction='horizontal',
+               line_scale=15, iterations=0):
    """Finds horizontal and vertical lines by applying morphological
    transformations on an image.

@ -56,9 +57,13 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio
    ----------
    threshold : object
        numpy.ndarray representing the thresholded image.
+    regions : list, optional (default: None)
+        List of page regions that may contain tables of the form x1,y1,x2,y2
+        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
+        in image coordinate space.
    direction : string, optional (default: 'horizontal')
        Specifies whether to find vertical or horizontal lines.
-    line_size_scaling : int, optional (default: 15)
+    line_scale : int, optional (default: 15)
        Factor by which the page dimensions will be divided to get
        smallest length of lines that should be detected.

@ -83,26 +88,33 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio
    lines = []

    if direction == 'vertical':
-        size = threshold.shape[0] // line_size_scaling
+        size = threshold.shape[0] // line_scale
        el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
    elif direction == 'horizontal':
-        size = threshold.shape[1] // line_size_scaling
+        size = threshold.shape[1] // line_scale
        el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
    elif direction is None:
        raise ValueError("Specify direction as either 'vertical' or"
                         " 'horizontal'")

+    if regions is not None:
+        region_mask = np.zeros(threshold.shape)
+        for region in regions:
+            x, y, w, h = region
+            region_mask[y : y + h, x : x + w] = 1
+        threshold = np.multiply(threshold, region_mask)
+
    threshold = cv2.erode(threshold, el)
    threshold = cv2.dilate(threshold, el)
    dmask = cv2.dilate(threshold, el, iterations=iterations)

    try:
        _, contours, _ = cv2.findContours(
-            threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+            threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    except ValueError:
        # for opencv backward compatibility
        contours, _ = cv2.findContours(
-            threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+            threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    for c in contours:
        x, y, w, h = cv2.boundingRect(c)
@ -116,7 +128,7 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio
    return dmask, lines


-def find_table_contours(vertical, horizontal):
+def find_contours(vertical, horizontal):
    """Finds table boundaries using OpenCV's findContours.

    Parameters
@ -138,11 +150,12 @@ def find_table_contours(vertical, horizontal):

    try:
        __, contours, __ = cv2.findContours(
-            mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+            mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    except ValueError:
        # for opencv backward compatibility
        contours, __ = cv2.findContours(
-            mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+            mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    # sort in reverse based on contour area and use first 10 contours
    contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]

    cont = []
@ -153,7 +166,7 @@ def find_table_contours(vertical, horizontal):
    return cont


-def find_table_joints(contours, vertical, horizontal):
+def find_joints(contours, vertical, horizontal):
    """Finds joints/intersections present inside each table boundary.

    Parameters
@ -176,18 +189,18 @@ def find_table_joints(contours, vertical, horizontal):
        and (x2, y2) -> rt in image coordinate space.

    """
-    joints = np.bitwise_and(vertical, horizontal)
+    joints = np.multiply(vertical, horizontal)
    tables = {}
    for c in contours:
        x, y, w, h = c
        roi = joints[y : y + h, x : x + w]
        try:
            __, jc, __ = cv2.findContours(
-                roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
+                roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
        except ValueError:
            # for opencv backward compatibility
            jc, __ = cv2.findContours(
-                roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
+                roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
        if len(jc) <= 4:  # remove contours with less than 4 joints
            continue
        joint_coords = []
--- a/camelot/io.py
+++ b/camelot/io.py
@ -52,7 +52,7 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
        to generate columns.
    process_background* : bool, optional (default: False)
        Process background lines.
-    line_size_scaling* : int, optional (default: 15)
+    line_scale* : int, optional (default: 15)
        Line size scaling factor. The larger the value the smaller
        the detected lines. Making it very large will lead to text
        being detected as lines.
--- a/camelot/parsers/lattice.py
+++ b/camelot/parsers/lattice.py
@ -16,7 +16,7 @@ from ..utils import (scale_image, scale_pdf, segments_in_bbox, text_in_bbox,
                     merge_close_lines, get_table_index, compute_accuracy,
                     compute_whitespace)
 from ..image_processing import (adaptive_threshold, find_lines,
-                                find_table_contours, find_table_joints)
+                                find_contours, find_joints)


 logger = logging.getLogger('camelot')
@ -28,13 +28,17 @@ class Lattice(BaseParser):

    Parameters
    ----------
+    table_regions : list, optional (default: None)
+        List of page regions that may contain tables of the form x1,y1,x2,y2
+        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
+        in PDF coordinate space.
    table_areas : list, optional (default: None)
        List of table area strings of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
        in PDF coordinate space.
    process_background : bool, optional (default: False)
        Process background lines.
-    line_size_scaling : int, optional (default: 15)
+    line_scale : int, optional (default: 15)
        Line size scaling factor. The larger the value the smaller
        the detected lines. Making it very large will lead to text
        being detected as lines.
@ -77,14 +81,15 @@ class Lattice(BaseParser):
        Resolution used for PDF to PNG conversion.

    """
-    def __init__(self, table_areas=None, process_background=False,
-                 line_size_scaling=15, copy_text=None, shift_text=['l', 't'],
+    def __init__(self, table_regions=None, table_areas=None, process_background=False,
+                 line_scale=15, copy_text=None, shift_text=['l', 't'],
                 split_text=False, flag_size=False, strip_text='', line_tol=2,
                 joint_tol=2, threshold_blocksize=15, threshold_constant=-2,
                 iterations=0, resolution=300, **kwargs):
+        self.table_regions = table_regions
        self.table_areas = table_areas
        self.process_background = process_background
-        self.line_size_scaling = line_size_scaling
+        self.line_scale = line_scale
        self.copy_text = copy_text
        self.shift_text = shift_text
        self.split_text = split_text
@ -227,9 +232,22 @@ class Lattice(BaseParser):
            stderr=subprocess.STDOUT)

    def _generate_table_bbox(self):
+        def scale_areas(areas):
+            scaled_areas = []
+            for area in areas:
+                x1, y1, x2, y2 = area.split(",")
+                x1 = float(x1)
+                y1 = float(y1)
+                x2 = float(x2)
+                y2 = float(y2)
+                x1, y1, x2, y2 = scale_pdf((x1, y1, x2, y2), image_scalers)
+                scaled_areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
+            return scaled_areas
+
        self.image, self.threshold = adaptive_threshold(
            self.imagename, process_background=self.process_background,
            blocksize=self.threshold_blocksize, c=self.threshold_constant)
+
        image_width = self.image.shape[1]
        image_height = self.image.shape[0]
        image_width_scaler = image_width / float(self.pdf_width)
@ -239,27 +257,30 @@ class Lattice(BaseParser):
        image_scalers = (image_width_scaler, image_height_scaler, self.pdf_height)
        pdf_scalers = (pdf_width_scaler, pdf_height_scaler, image_height)

-        vertical_mask, vertical_segments = find_lines(
-            self.threshold, direction='vertical',
-            line_size_scaling=self.line_size_scaling, iterations=self.iterations)
-        horizontal_mask, horizontal_segments = find_lines(
-            self.threshold, direction='horizontal',
-            line_size_scaling=self.line_size_scaling, iterations=self.iterations)
+        if self.table_areas is None:
+            regions = None
+            if self.table_regions is not None:
+                regions = scale_areas(self.table_regions)

-        if self.table_areas is not None:
-            areas = []
-            for area in self.table_areas:
-                x1, y1, x2, y2 = area.split(",")
-                x1 = float(x1)
-                y1 = float(y1)
-                x2 = float(x2)
-                y2 = float(y2)
-                x1, y1, x2, y2 = scale_pdf((x1, y1, x2, y2), image_scalers)
-                areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
-            table_bbox = find_table_joints(areas, vertical_mask, horizontal_mask)
+            vertical_mask, vertical_segments = find_lines(
+                self.threshold, regions=regions, direction='vertical',
+                line_scale=self.line_scale, iterations=self.iterations)
+            horizontal_mask, horizontal_segments = find_lines(
+                self.threshold, regions=regions, direction='horizontal',
+                line_scale=self.line_scale, iterations=self.iterations)
+
+            contours = find_contours(vertical_mask, horizontal_mask)
+            table_bbox = find_joints(contours, vertical_mask, horizontal_mask)
        else:
-            contours = find_table_contours(vertical_mask, horizontal_mask)
-            table_bbox = find_table_joints(contours, vertical_mask, horizontal_mask)
+            vertical_mask, vertical_segments = find_lines(
+                self.threshold, direction='vertical', line_scale=self.line_scale,
+                iterations=self.iterations)
+            horizontal_mask, horizontal_segments = find_lines(
+                self.threshold, direction='horizontal', line_scale=self.line_scale,
+                iterations=self.iterations)
+
+            areas = scale_areas(self.table_areas)
+            table_bbox = find_joints(areas, vertical_mask, horizontal_mask)

        self.table_bbox_unscaled = copy.deepcopy(table_bbox)

--- a/camelot/parsers/stream.py
+++ b/camelot/parsers/stream.py
@ -26,6 +26,10 @@ class Stream(BaseParser):

    Parameters
    ----------
+    table_regions : list, optional (default: None)
+        List of page regions that may contain tables of the form x1,y1,x2,y2
+        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
+        in PDF coordinate space.
    table_areas : list, optional (default: None)
        List of table area strings of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
@ -51,9 +55,10 @@ class Stream(BaseParser):
        to generate columns.

    """
-    def __init__(self, table_areas=None, columns=None, split_text=False,
+    def __init__(self, table_regions=None, table_areas=None, columns=None, split_text=False,
                 flag_size=False, strip_text='', edge_tol=50, row_tol=2,
                 column_tol=0, **kwargs):
+        self.table_regions = table_regions
        self.table_areas = table_areas
        self.columns = columns
        self._validate_columns()
@ -275,7 +280,18 @@ class Stream(BaseParser):

    def _generate_table_bbox(self):
        self.textedges = []
-        if self.table_areas is not None:
+        if self.table_areas is None:
+            hor_text = self.horizontal_text
+            if self.table_regions is not None:
+                # filter horizontal text
+                hor_text = []
+                for region in self.table_regions:
+                    x1, y1, x2, y2 = region
+                    region_text = text_in_bbox((x1, y2, x2, y1), self.horizontal_text)
+                    hor_text.extend(region_text)
+            # find tables based on nurminen's detection algorithm
+            table_bbox = self._nurminen_table_detection(hor_text)
+        else:
            table_bbox = {}
            for area in self.table_areas:
                x1, y1, x2, y2 = area.split(",")
@ -284,9 +300,6 @@ class Stream(BaseParser):
                x2 = float(x2)
                y2 = float(y2)
                table_bbox[(x1, y2, x2, y1)] = None
-        else:
-            # find tables based on nurminen's detection algorithm
-            table_bbox = self._nurminen_table_detection(self.horizontal_text)
        self.table_bbox = table_bbox

    def _generate_columns_and_rows(self, table_idx, tk):
--- a/camelot/utils.py
+++ b/camelot/utils.py
@ -101,7 +101,7 @@ stream_kwargs = [
 ]
 lattice_kwargs = [
    'process_background',
-    'line_size_scaling',
+    'line_scale',
    'copy_text',
    'shift_text',
    'line_tol',
@ -339,7 +339,7 @@ def text_in_bbox(bbox, text):
    ----------
    bbox : tuple
        Tuple (x1, y1, x2, y2) representing a bounding box where
-        (x1, y1) -> lb and (x2, y2) -> rt in PDFMiner coordinate
+        (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
        space.
    text : List of PDFMiner text objects.

--- a/docs/_static/csv/table_regions.csv
+++ b/docs/_static/csv/table_regions.csv
@ -0,0 +1,4 @@
+"Età dell’Assicuratoall’epoca del decesso","Misura % dimaggiorazione"
+"18-75","1,00%"
+"76-80","0,50%"
+"81 in poi","0,10%"
--- a/docs/_static/pdf/table_regions.pdf
+++ b/docs/_static/pdf/table_regions.pdf
--- a/docs/user/advanced.rst
+++ b/docs/user/advanced.rst
@ -206,12 +206,10 @@ You can also visualize the textedges found on a page by specifying ``kind='texte
 Specify table areas
 -------------------

-In cases such as `these <../_static/pdf/table_areas.pdf>`__, it can be useful to specify table boundaries. You can plot the text on this page and note the top left and bottom right coordinates of the table.
+In cases such as `these <../_static/pdf/table_areas.pdf>`__, it can be useful to specify exact table boundaries. You can plot the text on this page and note the top left and bottom right coordinates of the table.

 Table areas that you want Camelot to analyze can be passed as a list of comma-separated strings to :meth:`read_pdf() <camelot.read_pdf>`, using the ``table_areas`` keyword argument.

-.. _for now: https://github.com/socialcopsdev/camelot/issues/102
-
 ::

    >>> tables = camelot.read_pdf('table_areas.pdf', flavor='stream', table_areas=['316,499,566,337'])
@ -226,6 +224,27 @@ Table areas that you want Camelot to analyze can be passed as a list of comma-se
 .. csv-table::
  :file: ../_static/csv/table_areas.csv

+Specify table regions
+---------------------
+
+However, there may be cases like `[1] <../_static/pdf/table_regions.pdf>`__ and `[2] <https://github.com/socialcopsdev/camelot/blob/master/tests/files/tableception.pdf>`__, where the table might not lie at the exact coordinates every time but within an approximate region.
+
+You can use the ``table_regions`` keyword argument to :meth:`read_pdf() <camelot.read_pdf>` to solve for such cases. When ``table_regions`` is specified, Camelot will only analyze the specified regions to look for tables.
+
+::
+
+    >>> tables = camelot.read_pdf('table_regions.pdf', table_regions=['170,370,560,270'])
+    >>> tables[0].df
+
+.. tip::
+    Here's how you can do the same with the :ref:`command-line interface <cli>`.
+    ::
+
+        $ camelot lattice -R 170,370,560,270 table_regions.pdf
+
+.. csv-table::
+  :file: ../_static/csv/table_regions.csv
+
 Specify column separators
 -------------------------

@ -434,11 +453,11 @@ You can pass ``row_tol=<+int>`` to group the rows closer together, as shown belo
 Detect short lines
 ------------------

-There might be cases while using :ref:`Lattice <lattice>` when smaller lines don't get detected. The size of the smallest line that gets detected is calculated by dividing the PDF page's dimensions with a scaling factor called ``line_size_scaling``. By default, its value is 15.
+There might be cases while using :ref:`Lattice <lattice>` when smaller lines don't get detected. The size of the smallest line that gets detected is calculated by dividing the PDF page's dimensions by a scaling factor called ``line_scale``. By default, its value is 15.

-As you can guess, the larger the ``line_size_scaling``, the smaller the size of lines getting detected.
+As you can guess, the larger the ``line_scale``, the smaller the size of lines getting detected.

-.. warning:: Making ``line_size_scaling`` very large (>150) will lead to text getting detected as lines.
+.. warning:: Making ``line_scale`` very large (>150) will lead to text getting detected as lines.

 Here's a `PDF <../_static/pdf/short_lines.pdf>`__ where small lines separating the headers don't get detected with the default value of 15.

@ -458,11 +477,11 @@ Let's plot the table for this PDF.
    :alt: A plot of the PDF table with short lines
    :align: left

-Clearly, the smaller lines separating the headers, couldn't be detected. Let's try with ``line_size_scaling=40``, and plot the table again.
+Clearly, the smaller lines separating the headers couldn't be detected. Let's try with ``line_scale=40``, and plot the table again.

 ::

-    >>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40)
+    >>> tables = camelot.read_pdf('short_lines.pdf', line_scale=40)
    >>> camelot.plot(tables[0], kind='grid')
    >>> plt.show()

@ -511,7 +530,7 @@ We'll use the `PDF <../_static/pdf/short_lines.pdf>`__ from the previous example

 ::

-    >>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40, shift_text=[''])
+    >>> tables = camelot.read_pdf('short_lines.pdf', line_scale=40, shift_text=[''])
    >>> tables[0].df

 .. csv-table::
@ -532,7 +551,7 @@ No surprises there — it did remain in place (observe the strings "2400" and "A

 ::

-    >>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40, shift_text=['r', 'b'])
+    >>> tables = camelot.read_pdf('short_lines.pdf', line_scale=40, shift_text=['r', 'b'])
    >>> tables[0].df

 .. tip::
--- a/tests/data.py
+++ b/tests/data.py
@ -427,6 +427,13 @@ data_lattice_two_tables_2 = [
    ["Pooled", "23889", "47.7", "1.5", "9.9", "19.9", "17.8", "3.3"]
 ]

+data_lattice_table_regions = [
+    ['Età dell’Assicurato \nall’epoca del decesso', 'Misura % di \nmaggiorazione'],
+    ['18-75', '1,00%'],
+    ['76-80', '0,50%'],
+    ['81 in poi', '0,10%']
+]
+
 data_lattice_table_areas = [
    ["", "", "", "", "", "", "", "", ""],
    ["State", "n", "Literacy Status", "", "", "", "", "", ""],
--- a/tests/files/table_region.pdf
+++ b/tests/files/table_region.pdf
--- a/tests/test_common.py
+++ b/tests/test_common.py
@ -159,6 +159,14 @@ def test_lattice_two_tables():
    assert df2.equals(tables[1].df)


+def test_lattice_table_regions():
+    df = pd.DataFrame(data_lattice_table_regions)
+
+    filename = os.path.join(testdir, "table_region.pdf")
+    tables = camelot.read_pdf(filename, table_regions=["170,370,560,270"])
+    assert df.equals(tables[0].df)
+
+
 def test_lattice_table_areas():
    df = pd.DataFrame(data_lattice_table_areas)

@ -179,7 +187,7 @@ def test_lattice_copy_text():
    df = pd.DataFrame(data_lattice_copy_text)

    filename = os.path.join(testdir, "row_span_1.pdf")
-    tables = camelot.read_pdf(filename, line_size_scaling=60, copy_text="v")
+    tables = camelot.read_pdf(filename, line_scale=60, copy_text="v")
    assert df.equals(tables[0].df)


@ -189,13 +197,13 @@ def test_lattice_shift_text():
    df_rb = pd.DataFrame(data_lattice_shift_text_right_bottom)

    filename = os.path.join(testdir, "column_span_2.pdf")
-    tables = camelot.read_pdf(filename, line_size_scaling=40)
+    tables = camelot.read_pdf(filename, line_scale=40)
    assert df_lt.equals(tables[0].df)

-    tables = camelot.read_pdf(filename, line_size_scaling=40, shift_text=[''])
+    tables = camelot.read_pdf(filename, line_scale=40, shift_text=[''])
    assert df_disable.equals(tables[0].df)

-    tables = camelot.read_pdf(filename, line_size_scaling=40, shift_text=['r', 'b'])
+    tables = camelot.read_pdf(filename, line_scale=40, shift_text=['r', 'b'])
    assert df_rb.equals(tables[0].df)