From 03f301b25c045941584c62c914987609222dd6a1 Mon Sep 17 00:00:00 2001
From: Vinayak Mehta <vmehta94@gmail.com>
Date: Fri, 4 Jan 2019 19:17:54 +0530
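Subject: [PATCH] Add table regions support

Add a table_regions option (-R/--table_regions on the CLI) to the Lattice
and Stream parsers. Lattice restricts the detected line masks to the given
regions before looking for table contours; Stream only stores the option
for now and does not filter text by region yet.

Also rename line_size_scaling to line_scale, find_table_contours to
find_contours and find_table_joints to find_joints.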
---
 camelot/cli.py              | 12 +++++++-
 camelot/image_processing.py | 34 +++++++++++++++--------
 camelot/io.py               |  2 +-
 camelot/parsers/lattice.py  | 55 ++++++++++++++++++++++++++-----------
 camelot/parsers/stream.py   | 18 ++++++++----
 camelot/utils.py            |  4 +--
 docs/user/advanced.rst      | 14 +++++-----
 tests/test_common.py        |  8 +++---
 8 files changed, 100 insertions(+), 47 deletions(-)

diff --git a/camelot/cli.py b/camelot/cli.py
index 3f240e0..a2b45a5 100644
--- a/camelot/cli.py
+++ b/camelot/cli.py
@@ -56,12 +56,15 @@ def cli(ctx, *args, **kwargs):
 
 
 @cli.command('lattice')
+@click.option('-R', '--table_regions', default=[], multiple=True,
+              help='Page regions to analyze. Example: x1,y1,x2,y2'
+              ' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
 @click.option('-T', '--table_areas', default=[], multiple=True,
               help='Table areas to process. Example: x1,y1,x2,y2'
               ' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
 @click.option('-back', '--process_background', is_flag=True,
               help='Process background lines.')
-@click.option('-scale', '--line_size_scaling', default=15,
+@click.option('-scale', '--line_scale', default=15,
               help='Line size scaling factor. The larger the value,'
               ' the smaller the detected lines.')
 @click.option('-copy', '--copy_text', default=[], type=click.Choice(['h', 'v']),
@@ -105,6 +108,8 @@ def lattice(c, *args, **kwargs):
     filepath = kwargs.pop('filepath')
     kwargs.update(conf)
 
+    table_regions = list(kwargs['table_regions'])
+    kwargs['table_regions'] = None if not table_regions else table_regions
     table_areas = list(kwargs['table_areas'])
     kwargs['table_areas'] = None if not table_areas else table_areas
     copy_text = list(kwargs['copy_text'])
@@ -132,6 +137,9 @@ def lattice(c, *args, **kwargs):
 
 
 @cli.command('stream')
+@click.option('-R', '--table_regions', default=[], multiple=True,
+              help='Page regions to analyze. Example: x1,y1,x2,y2'
+              ' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
 @click.option('-T', '--table_areas', default=[], multiple=True,
               help='Table areas to process. Example: x1,y1,x2,y2'
               ' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@@ -160,6 +168,8 @@ def stream(c, *args, **kwargs):
     filepath = kwargs.pop('filepath')
     kwargs.update(conf)
 
+    table_regions = list(kwargs['table_regions'])
+    kwargs['table_regions'] = None if not table_regions else table_regions
     table_areas = list(kwargs['table_areas'])
     kwargs['table_areas'] = None if not table_areas else table_areas
     columns = list(kwargs['columns'])
diff --git a/camelot/image_processing.py b/camelot/image_processing.py
index eb23101..8707d48 100644
--- a/camelot/image_processing.py
+++ b/camelot/image_processing.py
@@ -48,7 +48,8 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
     return img, threshold
 
 
-def find_lines(threshold, direction='horizontal', line_size_scaling=15, iterations=0):
+def find_lines(threshold, regions=None, direction='horizontal',
+               line_scale=15, iterations=0):
     """Finds horizontal and vertical lines by applying morphological
     transformations on an image.
 
@@ -56,9 +57,13 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio
     ----------
     threshold : object
         numpy.ndarray representing the thresholded image.
+    regions : list, optional (default: None)
+        List of page regions that may contain tables, each a tuple
+        (x, y, w, h) where (x, y) -> left-top corner and w, h -> width
+        and height in image coordinate space.
     direction : string, optional (default: 'horizontal')
         Specifies whether to find vertical or horizontal lines.
-    line_size_scaling : int, optional (default: 15)
+    line_scale : int, optional (default: 15)
         Factor by which the page dimensions will be divided to get
         smallest length of lines that should be detected.
 
@@ -83,10 +88,10 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio
     lines = []
 
     if direction == 'vertical':
-        size = threshold.shape[0] // line_size_scaling
+        size = threshold.shape[0] // line_scale
         el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
     elif direction == 'horizontal':
-        size = threshold.shape[1] // line_size_scaling
+        size = threshold.shape[1] // line_scale
         el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
     elif direction is None:
         raise ValueError("Specify direction as either 'vertical' or"
@@ -112,11 +117,17 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio
             lines.append(((x1 + x2) // 2, y2, (x1 + x2) // 2, y1))
         elif direction == 'horizontal':
             lines.append((x1, (y1 + y2) // 2, x2, (y1 + y2) // 2))
+    if regions is not None:
+        region_mask = np.zeros(dmask.shape)
+        for region in regions:
+            x, y, w, h = region
+            region_mask[y : y + h, x : x + w] = 1
+        dmask = np.multiply(dmask, region_mask)
 
     return dmask, lines
 
 
-def find_table_contours(vertical, horizontal):
+def find_contours(vertical, horizontal):
     """Finds table boundaries using OpenCV's findContours.
 
     Parameters
@@ -138,11 +149,12 @@ def find_table_contours(vertical, horizontal):
 
     try:
         __, contours, __ = cv2.findContours(
-            mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+            mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
     except ValueError:
         # for opencv backward compatibility
         contours, __ = cv2.findContours(
-            mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+            mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    # sort in reverse based on contour area and use first 10 contours
     contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]
 
     cont = []
@@ -153,7 +165,7 @@ def find_table_contours(vertical, horizontal):
     return cont
 
 
-def find_table_joints(contours, vertical, horizontal):
+def find_joints(contours, vertical, horizontal):
     """Finds joints/intersections present inside each table boundary.
 
     Parameters
@@ -176,18 +188,18 @@ def find_table_joints(contours, vertical, horizontal):
         and (x2, y2) -> rt in image coordinate space.
 
     """
-    joints = np.bitwise_and(vertical, horizontal)
+    joints = np.multiply(vertical, horizontal)
     tables = {}
     for c in contours:
         x, y, w, h = c
         roi = joints[y : y + h, x : x + w]
         try:
             __, jc, __ = cv2.findContours(
-                roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
+                roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
         except ValueError:
             # for opencv backward compatibility
             jc, __ = cv2.findContours(
-                roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
+                roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
         if len(jc) <= 4:  # remove contours with less than 4 joints
             continue
         joint_coords = []
diff --git a/camelot/io.py b/camelot/io.py
index 44f3354..5162dd2 100644
--- a/camelot/io.py
+++ b/camelot/io.py
@@ -52,7 +52,7 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
         to generate columns.
     process_background* : bool, optional (default: False)
         Process background lines.
-    line_size_scaling* : int, optional (default: 15)
+    line_scale* : int, optional (default: 15)
         Line size scaling factor. The larger the value the smaller
         the detected lines. Making it very large will lead to text
         being detected as lines.
diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py
index f4bc2c0..e061f65 100644
--- a/camelot/parsers/lattice.py
+++ b/camelot/parsers/lattice.py
@@ -16,7 +16,7 @@ from ..utils import (scale_image, scale_pdf, segments_in_bbox, text_in_bbox,
                      merge_close_lines, get_table_index, compute_accuracy,
                      compute_whitespace)
 from ..image_processing import (adaptive_threshold, find_lines,
-                                find_table_contours, find_table_joints)
+                                find_contours, find_joints)
 
 
 logger = logging.getLogger('camelot')
@@ -28,13 +28,17 @@ class Lattice(BaseParser):
 
     Parameters
     ----------
+    table_regions : list, optional (default: None)
+        List of page regions that may contain tables of the form x1,y1,x2,y2
+        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
+        in PDF coordinate space.
     table_areas : list, optional (default: None)
         List of table area strings of the form x1,y1,x2,y2
         where (x1, y1) -> left-top and (x2, y2) -> right-bottom
         in PDF coordinate space.
     process_background : bool, optional (default: False)
         Process background lines.
-    line_size_scaling : int, optional (default: 15)
+    line_scale : int, optional (default: 15)
         Line size scaling factor. The larger the value the smaller
         the detected lines. Making it very large will lead to text
         being detected as lines.
@@ -77,14 +81,15 @@ class Lattice(BaseParser):
         Resolution used for PDF to PNG conversion.
 
     """
-    def __init__(self, table_areas=None, process_background=False,
-                 line_size_scaling=15, copy_text=None, shift_text=['l', 't'],
+    def __init__(self, table_regions=None, table_areas=None, process_background=False,
+                 line_scale=15, copy_text=None, shift_text=['l', 't'],
                  split_text=False, flag_size=False, strip_text='', line_tol=2,
                  joint_tol=2, threshold_blocksize=15, threshold_constant=-2,
                  iterations=0, resolution=300, **kwargs):
+        self.table_regions = table_regions
         self.table_areas = table_areas
         self.process_background = process_background
-        self.line_size_scaling = line_size_scaling
+        self.line_scale = line_scale
         self.copy_text = copy_text
         self.shift_text = shift_text
         self.split_text = split_text
@@ -239,14 +244,35 @@ class Lattice(BaseParser):
         image_scalers = (image_width_scaler, image_height_scaler, self.pdf_height)
         pdf_scalers = (pdf_width_scaler, pdf_height_scaler, image_height)
 
-        vertical_mask, vertical_segments = find_lines(
-            self.threshold, direction='vertical',
-            line_size_scaling=self.line_size_scaling, iterations=self.iterations)
-        horizontal_mask, horizontal_segments = find_lines(
-            self.threshold, direction='horizontal',
-            line_size_scaling=self.line_size_scaling, iterations=self.iterations)
+        if self.table_areas is None:
+            regions = None
+            if self.table_regions is not None:
+                regions = []
+                for region in self.table_regions:
+                    x1, y1, x2, y2 = region.split(",")
+                    x1 = float(x1)
+                    y1 = float(y1)
+                    x2 = float(x2)
+                    y2 = float(y2)
+                    x1, y1, x2, y2 = scale_pdf((x1, y1, x2, y2), image_scalers)
+                    regions.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
+            vertical_mask, vertical_segments = find_lines(
+                self.threshold, regions=regions, direction='vertical',
+                line_scale=self.line_scale, iterations=self.iterations)
+            horizontal_mask, horizontal_segments = find_lines(
+                self.threshold, regions=regions, direction='horizontal',
+                line_scale=self.line_scale, iterations=self.iterations)
+
+            contours = find_contours(vertical_mask, horizontal_mask)
+            table_bbox = find_joints(contours, vertical_mask, horizontal_mask)
+        else:
+            vertical_mask, vertical_segments = find_lines(
+                self.threshold, direction='vertical', line_scale=self.line_scale,
+                iterations=self.iterations)
+            horizontal_mask, horizontal_segments = find_lines(
+                self.threshold, direction='horizontal', line_scale=self.line_scale,
+                iterations=self.iterations)
 
-        if self.table_areas is not None:
             areas = []
             for area in self.table_areas:
                 x1, y1, x2, y2 = area.split(",")
@@ -256,10 +282,7 @@ class Lattice(BaseParser):
                 y2 = float(y2)
                 x1, y1, x2, y2 = scale_pdf((x1, y1, x2, y2), image_scalers)
                 areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
-            table_bbox = find_table_joints(areas, vertical_mask, horizontal_mask)
-        else:
-            contours = find_table_contours(vertical_mask, horizontal_mask)
-            table_bbox = find_table_joints(contours, vertical_mask, horizontal_mask)
+            table_bbox = find_joints(areas, vertical_mask, horizontal_mask)
 
         self.table_bbox_unscaled = copy.deepcopy(table_bbox)
 
diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py
index d36212b..82c5fd1 100644
--- a/camelot/parsers/stream.py
+++ b/camelot/parsers/stream.py
@@ -26,6 +26,10 @@ class Stream(BaseParser):
 
     Parameters
     ----------
+    table_regions : list, optional (default: None)
+        List of page regions that may contain tables of the form x1,y1,x2,y2
+        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
+        in PDF coordinate space.
     table_areas : list, optional (default: None)
         List of table area strings of the form x1,y1,x2,y2
         where (x1, y1) -> left-top and (x2, y2) -> right-bottom
@@ -51,9 +55,10 @@ class Stream(BaseParser):
         to generate columns.
 
     """
-    def __init__(self, table_areas=None, columns=None, split_text=False,
+    def __init__(self, table_regions=None, table_areas=None, columns=None, split_text=False,
                  flag_size=False, strip_text='', edge_tol=50, row_tol=2,
                  column_tol=0, **kwargs):
+        self.table_regions = table_regions
         self.table_areas = table_areas
         self.columns = columns
         self._validate_columns()
@@ -275,7 +280,13 @@ class Stream(BaseParser):
 
     def _generate_table_bbox(self):
         self.textedges = []
-        if self.table_areas is not None:
+        if self.table_areas is None:
+            if self.table_regions is not None:
+                # TODO: filter horizontal text by table_regions
+                pass
+            # find tables based on nurminen's detection algorithm
+            table_bbox = self._nurminen_table_detection(self.horizontal_text)
+        else:
             table_bbox = {}
             for area in self.table_areas:
                 x1, y1, x2, y2 = area.split(",")
@@ -284,9 +295,6 @@ class Stream(BaseParser):
                 x2 = float(x2)
                 y2 = float(y2)
                 table_bbox[(x1, y2, x2, y1)] = None
-        else:
-            # find tables based on nurminen's detection algorithm
-            table_bbox = self._nurminen_table_detection(self.horizontal_text)
         self.table_bbox = table_bbox
 
     def _generate_columns_and_rows(self, table_idx, tk):
diff --git a/camelot/utils.py b/camelot/utils.py
index 3b78d5e..7b22307 100644
--- a/camelot/utils.py
+++ b/camelot/utils.py
@@ -101,7 +101,7 @@ stream_kwargs = [
 ]
 lattice_kwargs = [
     'process_background',
-    'line_size_scaling',
+    'line_scale',
     'copy_text',
     'shift_text',
     'line_tol',
@@ -339,7 +339,7 @@ def text_in_bbox(bbox, text):
     ----------
     bbox : tuple
         Tuple (x1, y1, x2, y2) representing a bounding box where
-        (x1, y1) -> lb and (x2, y2) -> rt in PDFMiner coordinate
+        (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
         space.
     text : List of PDFMiner text objects.
 
diff --git a/docs/user/advanced.rst b/docs/user/advanced.rst
index ca40bb8..f454c1e 100644
--- a/docs/user/advanced.rst
+++ b/docs/user/advanced.rst
@@ -434,11 +434,11 @@ You can pass ``row_tol=<+int>`` to group the rows closer together, as shown belo
 Detect short lines
 ------------------
 
-There might be cases while using :ref:`Lattice <lattice>` when smaller lines don't get detected. The size of the smallest line that gets detected is calculated by dividing the PDF page's dimensions with a scaling factor called ``line_size_scaling``. By default, its value is 15.
+There might be cases while using :ref:`Lattice <lattice>` when smaller lines don't get detected. The size of the smallest line that gets detected is calculated by dividing the PDF page's dimensions by a scaling factor called ``line_scale``. By default, its value is 15.
 
-As you can guess, the larger the ``line_size_scaling``, the smaller the size of lines getting detected.
+As you can guess, the larger the ``line_scale``, the smaller the size of lines getting detected.
 
-.. warning:: Making ``line_size_scaling`` very large (>150) will lead to text getting detected as lines.
+.. warning:: Making ``line_scale`` very large (>150) will lead to text getting detected as lines.
 
 Here's a `PDF <../_static/pdf/short_lines.pdf>`__ where small lines separating the the headers don't get detected with the default value of 15.
 
@@ -458,11 +458,11 @@ Let's plot the table for this PDF.
     :alt: A plot of the PDF table with short lines
     :align: left
 
-Clearly, the smaller lines separating the headers, couldn't be detected. Let's try with ``line_size_scaling=40``, and plot the table again.
+Clearly, the smaller lines separating the headers couldn't be detected. Let's try with ``line_scale=40``, and plot the table again.
 
 ::
 
-    >>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40)
+    >>> tables = camelot.read_pdf('short_lines.pdf', line_scale=40)
     >>> camelot.plot(tables[0], kind='grid')
     >>> plt.show()
 
@@ -511,7 +511,7 @@ We'll use the `PDF <../_static/pdf/short_lines.pdf>`__ from the previous example
 
 ::
 
-    >>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40, shift_text=[''])
+    >>> tables = camelot.read_pdf('short_lines.pdf', line_scale=40, shift_text=[''])
     >>> tables[0].df
 
 .. csv-table::
@@ -532,7 +532,7 @@ No surprises there — it did remain in place (observe the strings "2400" and "A
 
 ::
 
-    >>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40, shift_text=['r', 'b'])
+    >>> tables = camelot.read_pdf('short_lines.pdf', line_scale=40, shift_text=['r', 'b'])
     >>> tables[0].df
 
 .. tip::
diff --git a/tests/test_common.py b/tests/test_common.py
index f9f26bf..3a24c55 100644
--- a/tests/test_common.py
+++ b/tests/test_common.py
@@ -179,7 +179,7 @@ def test_lattice_copy_text():
     df = pd.DataFrame(data_lattice_copy_text)
 
     filename = os.path.join(testdir, "row_span_1.pdf")
-    tables = camelot.read_pdf(filename, line_size_scaling=60, copy_text="v")
+    tables = camelot.read_pdf(filename, line_scale=60, copy_text="v")
     assert df.equals(tables[0].df)
 
 
@@ -189,13 +189,13 @@ def test_lattice_shift_text():
     df_rb = pd.DataFrame(data_lattice_shift_text_right_bottom)
 
     filename = os.path.join(testdir, "column_span_2.pdf")
-    tables = camelot.read_pdf(filename, line_size_scaling=40)
+    tables = camelot.read_pdf(filename, line_scale=40)
     assert df_lt.equals(tables[0].df)
 
-    tables = camelot.read_pdf(filename, line_size_scaling=40, shift_text=[''])
+    tables = camelot.read_pdf(filename, line_scale=40, shift_text=[''])
     assert df_disable.equals(tables[0].df)
 
-    tables = camelot.read_pdf(filename, line_size_scaling=40, shift_text=['r', 'b'])
+    tables = camelot.read_pdf(filename, line_scale=40, shift_text=['r', 'b'])
     assert df_rb.equals(tables[0].df)