Add table regions support

2019-01-04 19:17:54 +05:30
parent a5027e81c5
commit 03f301b25c
8 changed files with 100 additions and 47 deletions
@@ -56,12 +56,15 @@ def cli(ctx, *args, **kwargs):


@cli.command('lattice')
+@click.option('-R', '--table_regions', default=[], multiple=True,
+              help='Page regions to analyze. Example: x1,y1,x2,y2'
+              ' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@click.option('-T', '--table_areas', default=[], multiple=True,
              help='Table areas to process. Example: x1,y1,x2,y2'
              ' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@click.option('-back', '--process_background', is_flag=True,
              help='Process background lines.')
-@click.option('-scale', '--line_size_scaling', default=15,
+@click.option('-scale', '--line_scale', default=15,
              help='Line size scaling factor. The larger the value,'
              ' the smaller the detected lines.')
@click.option('-copy', '--copy_text', default=[], type=click.Choice(['h', 'v']),
@@ -105,6 +108,8 @@ def lattice(c, *args, **kwargs):
    filepath = kwargs.pop('filepath')
    kwargs.update(conf)

+    table_regions = list(kwargs['table_regions'])
+    kwargs['table_regions'] = None if not table_regions else table_regions
    table_areas = list(kwargs['table_areas'])
    kwargs['table_areas'] = None if not table_areas else table_areas
    copy_text = list(kwargs['copy_text'])
@@ -132,6 +137,9 @@ def lattice(c, *args, **kwargs):


@cli.command('stream')
+@click.option('-R', '--table_regions', default=[], multiple=True,
+              help='Page regions to analyze. Example: x1,y1,x2,y2'
+              ' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@click.option('-T', '--table_areas', default=[], multiple=True,
              help='Table areas to process. Example: x1,y1,x2,y2'
              ' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@@ -160,6 +168,8 @@ def stream(c, *args, **kwargs):
    filepath = kwargs.pop('filepath')
    kwargs.update(conf)

+    table_regions = list(kwargs['table_regions'])
+    kwargs['table_regions'] = None if not table_regions else table_regions
    table_areas = list(kwargs['table_areas'])
    kwargs['table_areas'] = None if not table_areas else table_areas
    columns = list(kwargs['columns'])
@@ -48,7 +48,8 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
    return img, threshold


-def find_lines(threshold, direction='horizontal', line_size_scaling=15, iterations=0):
+def find_lines(threshold, regions=None, direction='horizontal',
+               line_scale=15, iterations=0):
    """Finds horizontal and vertical lines by applying morphological
    transformations on an image.

@@ -56,9 +57,13 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio
    ----------
    threshold : object
        numpy.ndarray representing the thresholded image.
+    regions : list, optional (default: None)
+        List of page regions that may contain tables, each of the form
+        (x, y, w, h) where (x, y) -> left-top corner and (w, h) -> width
+        and height in image coordinate space.
    direction : string, optional (default: 'horizontal')
        Specifies whether to find vertical or horizontal lines.
-    line_size_scaling : int, optional (default: 15)
+    line_scale : int, optional (default: 15)
        Factor by which the page dimensions will be divided to get
        smallest length of lines that should be detected.

@@ -83,10 +88,10 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio
    lines = []

    if direction == 'vertical':
-        size = threshold.shape[0] // line_size_scaling
+        size = threshold.shape[0] // line_scale
        el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
    elif direction == 'horizontal':
-        size = threshold.shape[1] // line_size_scaling
+        size = threshold.shape[1] // line_scale
        el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
    elif direction is None:
        raise ValueError("Specify direction as either 'vertical' or"
@@ -112,11 +117,17 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio
            lines.append(((x1 + x2) // 2, y2, (x1 + x2) // 2, y1))
        elif direction == 'horizontal':
            lines.append((x1, (y1 + y2) // 2, x2, (y1 + y2) // 2))
+    if regions is not None:
+        region_mask = np.zeros(dmask.shape)
+        for region in regions:
+            x, y, w, h = region
+            region_mask[y : y + h, x : x + w] = 1
+        dmask = np.multiply(dmask, region_mask)

    return dmask, lines


-def find_table_contours(vertical, horizontal):
+def find_contours(vertical, horizontal):
    """Finds table boundaries using OpenCV's findContours.

    Parameters
@@ -138,11 +149,12 @@ def find_table_contours(vertical, horizontal):

    try:
        __, contours, __ = cv2.findContours(
-            mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+            mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    except ValueError:
        # for opencv backward compatibility
        contours, __ = cv2.findContours(
-            mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+            mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    # sort in reverse based on contour area and use first 10 contours
    contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]

    cont = []
@@ -153,7 +165,7 @@ def find_table_contours(vertical, horizontal):
    return cont


-def find_table_joints(contours, vertical, horizontal):
+def find_joints(contours, vertical, horizontal):
    """Finds joints/intersections present inside each table boundary.

    Parameters
@@ -176,18 +188,18 @@ def find_table_joints(contours, vertical, horizontal):
        and (x2, y2) -> rt in image coordinate space.

    """
-    joints = np.bitwise_and(vertical, horizontal)
+    joints = np.multiply(vertical, horizontal)
    tables = {}
    for c in contours:
        x, y, w, h = c
        roi = joints[y : y + h, x : x + w]
        try:
            __, jc, __ = cv2.findContours(
-                roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
+                roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
        except ValueError:
            # for opencv backward compatibility
            jc, __ = cv2.findContours(
-                roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
+                roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
        if len(jc) <= 4:  # remove contours with less than 4 joints
            continue
        joint_coords = []
@@ -52,7 +52,7 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
        to generate columns.
    process_background* : bool, optional (default: False)
        Process background lines.
-    line_size_scaling* : int, optional (default: 15)
+    line_scale* : int, optional (default: 15)
        Line size scaling factor. The larger the value the smaller
        the detected lines. Making it very large will lead to text
        being detected as lines.
@@ -16,7 +16,7 @@ from ..utils import (scale_image, scale_pdf, segments_in_bbox, text_in_bbox,
                     merge_close_lines, get_table_index, compute_accuracy,
                     compute_whitespace)
 from ..image_processing import (adaptive_threshold, find_lines,
-                                find_table_contours, find_table_joints)
+                                find_contours, find_joints)


 logger = logging.getLogger('camelot')
@@ -28,13 +28,17 @@ class Lattice(BaseParser):

    Parameters
    ----------
+    table_regions : list, optional (default: None)
+        List of page regions that may contain tables of the form x1,y1,x2,y2
+        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
+        in PDF coordinate space.
    table_areas : list, optional (default: None)
        List of table area strings of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
        in PDF coordinate space.
    process_background : bool, optional (default: False)
        Process background lines.
-    line_size_scaling : int, optional (default: 15)
+    line_scale : int, optional (default: 15)
        Line size scaling factor. The larger the value the smaller
        the detected lines. Making it very large will lead to text
        being detected as lines.
@@ -77,14 +81,15 @@ class Lattice(BaseParser):
        Resolution used for PDF to PNG conversion.

    """
-    def __init__(self, table_areas=None, process_background=False,
-                 line_size_scaling=15, copy_text=None, shift_text=['l', 't'],
+    def __init__(self, table_regions=None, table_areas=None, process_background=False,
+                 line_scale=15, copy_text=None, shift_text=['l', 't'],
                 split_text=False, flag_size=False, strip_text='', line_tol=2,
                 joint_tol=2, threshold_blocksize=15, threshold_constant=-2,
                 iterations=0, resolution=300, **kwargs):
+        self.table_regions = table_regions
        self.table_areas = table_areas
        self.process_background = process_background
-        self.line_size_scaling = line_size_scaling
+        self.line_scale = line_scale
        self.copy_text = copy_text
        self.shift_text = shift_text
        self.split_text = split_text
@@ -239,14 +244,35 @@ class Lattice(BaseParser):
        image_scalers = (image_width_scaler, image_height_scaler, self.pdf_height)
        pdf_scalers = (pdf_width_scaler, pdf_height_scaler, image_height)

-        vertical_mask, vertical_segments = find_lines(
-            self.threshold, direction='vertical',
-            line_size_scaling=self.line_size_scaling, iterations=self.iterations)
-        horizontal_mask, horizontal_segments = find_lines(
-            self.threshold, direction='horizontal',
-            line_size_scaling=self.line_size_scaling, iterations=self.iterations)
+        if self.table_areas is None:
+            regions = None
+            if self.table_regions is not None:
+                regions = []
+                for region in self.table_regions:
+                    x1, y1, x2, y2 = region.split(",")
+                    x1 = float(x1)
+                    y1 = float(y1)
+                    x2 = float(x2)
+                    y2 = float(y2)
+                    x1, y1, x2, y2 = scale_pdf((x1, y1, x2, y2), image_scalers)
+                    regions.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
+            vertical_mask, vertical_segments = find_lines(
+                self.threshold, regions=regions, direction='vertical',
+                line_scale=self.line_scale, iterations=self.iterations)
+            horizontal_mask, horizontal_segments = find_lines(
+                self.threshold, regions=regions, direction='horizontal',
+                line_scale=self.line_scale, iterations=self.iterations)
+
+            contours = find_contours(vertical_mask, horizontal_mask)
+            table_bbox = find_joints(contours, vertical_mask, horizontal_mask)
+        else:
+            vertical_mask, vertical_segments = find_lines(
+                self.threshold, direction='vertical', line_scale=self.line_scale,
+                iterations=self.iterations)
+            horizontal_mask, horizontal_segments = find_lines(
+                self.threshold, direction='horizontal', line_scale=self.line_scale,
+                iterations=self.iterations)

-        if self.table_areas is not None:
            areas = []
            for area in self.table_areas:
                x1, y1, x2, y2 = area.split(",")
@@ -256,10 +282,7 @@ class Lattice(BaseParser):
                y2 = float(y2)
                x1, y1, x2, y2 = scale_pdf((x1, y1, x2, y2), image_scalers)
                areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
-            table_bbox = find_table_joints(areas, vertical_mask, horizontal_mask)
-        else:
-            contours = find_table_contours(vertical_mask, horizontal_mask)
-            table_bbox = find_table_joints(contours, vertical_mask, horizontal_mask)
+            table_bbox = find_joints(areas, vertical_mask, horizontal_mask)

        self.table_bbox_unscaled = copy.deepcopy(table_bbox)

@@ -26,6 +26,10 @@ class Stream(BaseParser):

    Parameters
    ----------
+    table_regions : list, optional (default: None)
+        List of page regions that may contain tables of the form x1,y1,x2,y2
+        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
+        in PDF coordinate space.
    table_areas : list, optional (default: None)
        List of table area strings of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
@@ -51,9 +55,10 @@ class Stream(BaseParser):
        to generate columns.

    """
-    def __init__(self, table_areas=None, columns=None, split_text=False,
+    def __init__(self, table_regions=None, table_areas=None, columns=None, split_text=False,
                 flag_size=False, strip_text='', edge_tol=50, row_tol=2,
                 column_tol=0, **kwargs):
+        self.table_regions = table_regions
        self.table_areas = table_areas
        self.columns = columns
        self._validate_columns()
@@ -275,7 +280,13 @@ class Stream(BaseParser):

    def _generate_table_bbox(self):
        self.textedges = []
-        if self.table_areas is not None:
+        if self.table_areas is None:
+            if self.table_regions is not None:
+                # filter horizontal text
+                pass
+            # find tables based on nurminen's detection algorithm
+            table_bbox = self._nurminen_table_detection(self.horizontal_text)
+        else:
            table_bbox = {}
            for area in self.table_areas:
                x1, y1, x2, y2 = area.split(",")
@@ -284,9 +295,6 @@ class Stream(BaseParser):
                x2 = float(x2)
                y2 = float(y2)
                table_bbox[(x1, y2, x2, y1)] = None
-        else:
-            # find tables based on nurminen's detection algorithm
-            table_bbox = self._nurminen_table_detection(self.horizontal_text)
        self.table_bbox = table_bbox

    def _generate_columns_and_rows(self, table_idx, tk):
@@ -101,7 +101,7 @@ stream_kwargs = [
 ]
 lattice_kwargs = [
    'process_background',
-    'line_size_scaling',
+    'line_scale',
    'copy_text',
    'shift_text',
    'line_tol',
@@ -339,7 +339,7 @@ def text_in_bbox(bbox, text):
    ----------
    bbox : tuple
        Tuple (x1, y1, x2, y2) representing a bounding box where
-        (x1, y1) -> lb and (x2, y2) -> rt in PDFMiner coordinate
+        (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
        space.
    text : List of PDFMiner text objects.

@@ -434,11 +434,11 @@ You can pass ``row_tol=<+int>`` to group the rows closer together, as shown belo
 Detect short lines
 ------------------

-There might be cases while using :ref:`Lattice <lattice>` when smaller lines don't get detected. The size of the smallest line that gets detected is calculated by dividing the PDF page's dimensions with a scaling factor called ``line_size_scaling``. By default, its value is 15.
+There might be cases while using :ref:`Lattice <lattice>` when smaller lines don't get detected. The size of the smallest line that gets detected is calculated by dividing the PDF page's dimensions by a scaling factor called ``line_scale``. By default, its value is 15.

-As you can guess, the larger the ``line_size_scaling``, the smaller the size of lines getting detected.
+As you can guess, the larger the ``line_scale``, the smaller the size of lines getting detected.

-.. warning:: Making ``line_size_scaling`` very large (>150) will lead to text getting detected as lines.
+.. warning:: Making ``line_scale`` very large (>150) will lead to text getting detected as lines.

 Here's a `PDF <../_static/pdf/short_lines.pdf>`__ where small lines separating the headers don't get detected with the default value of 15.

@@ -458,11 +458,11 @@ Let's plot the table for this PDF.
    :alt: A plot of the PDF table with short lines
    :align: left

-Clearly, the smaller lines separating the headers, couldn't be detected. Let's try with ``line_size_scaling=40``, and plot the table again.
+Clearly, the smaller lines separating the headers couldn't be detected. Let's try with ``line_scale=40`` and plot the table again.

 ::

-    >>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40)
+    >>> tables = camelot.read_pdf('short_lines.pdf', line_scale=40)
    >>> camelot.plot(tables[0], kind='grid')
    >>> plt.show()

@@ -511,7 +511,7 @@ We'll use the `PDF <../_static/pdf/short_lines.pdf>`__ from the previous example

 ::

-    >>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40, shift_text=[''])
+    >>> tables = camelot.read_pdf('short_lines.pdf', line_scale=40, shift_text=[''])
    >>> tables[0].df

 .. csv-table::
@@ -532,7 +532,7 @@ No surprises there — it did remain in place (observe the strings "2400" and "A

 ::

-    >>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40, shift_text=['r', 'b'])
+    >>> tables = camelot.read_pdf('short_lines.pdf', line_scale=40, shift_text=['r', 'b'])
    >>> tables[0].df

 .. tip::
@@ -179,7 +179,7 @@ def test_lattice_copy_text():
    df = pd.DataFrame(data_lattice_copy_text)

    filename = os.path.join(testdir, "row_span_1.pdf")
-    tables = camelot.read_pdf(filename, line_size_scaling=60, copy_text="v")
+    tables = camelot.read_pdf(filename, line_scale=60, copy_text="v")
    assert df.equals(tables[0].df)


@@ -189,13 +189,13 @@ def test_lattice_shift_text():
    df_rb = pd.DataFrame(data_lattice_shift_text_right_bottom)

    filename = os.path.join(testdir, "column_span_2.pdf")
-    tables = camelot.read_pdf(filename, line_size_scaling=40)
+    tables = camelot.read_pdf(filename, line_scale=40)
    assert df_lt.equals(tables[0].df)

-    tables = camelot.read_pdf(filename, line_size_scaling=40, shift_text=[''])
+    tables = camelot.read_pdf(filename, line_scale=40, shift_text=[''])
    assert df_disable.equals(tables[0].df)

-    tables = camelot.read_pdf(filename, line_size_scaling=40, shift_text=['r', 'b'])
+    tables = camelot.read_pdf(filename, line_scale=40, shift_text=['r', 'b'])
    assert df_rb.equals(tables[0].df)