From 03f301b25c045941584c62c914987609222dd6a1 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Fri, 4 Jan 2019 19:17:54 +0530 Subject: [PATCH] Add table regions support --- camelot/cli.py | 12 +++++++- camelot/image_processing.py | 34 +++++++++++++++-------- camelot/io.py | 2 +- camelot/parsers/lattice.py | 55 ++++++++++++++++++++++++++----------- camelot/parsers/stream.py | 18 ++++++++---- camelot/utils.py | 4 +-- docs/user/advanced.rst | 14 +++++----- tests/test_common.py | 8 +++--- 8 files changed, 100 insertions(+), 47 deletions(-) diff --git a/camelot/cli.py b/camelot/cli.py index 3f240e0..a2b45a5 100644 --- a/camelot/cli.py +++ b/camelot/cli.py @@ -56,12 +56,15 @@ def cli(ctx, *args, **kwargs): @cli.command('lattice') +@click.option('-R', '--table_regions', default=[], multiple=True, + help='Page regions to analyze. Example: x1,y1,x2,y2' + ' where x1, y1 -> left-top and x2, y2 -> right-bottom.') @click.option('-T', '--table_areas', default=[], multiple=True, help='Table areas to process. Example: x1,y1,x2,y2' ' where x1, y1 -> left-top and x2, y2 -> right-bottom.') @click.option('-back', '--process_background', is_flag=True, help='Process background lines.') -@click.option('-scale', '--line_size_scaling', default=15, +@click.option('-scale', '--line_scale', default=15, help='Line size scaling factor. 
The larger the value,' ' the smaller the detected lines.') @click.option('-copy', '--copy_text', default=[], type=click.Choice(['h', 'v']), @@ -105,6 +108,8 @@ def lattice(c, *args, **kwargs): filepath = kwargs.pop('filepath') kwargs.update(conf) + table_regions = list(kwargs['table_regions']) + kwargs['table_regions'] = None if not table_regions else table_regions table_areas = list(kwargs['table_areas']) kwargs['table_areas'] = None if not table_areas else table_areas copy_text = list(kwargs['copy_text']) @@ -132,6 +137,9 @@ def lattice(c, *args, **kwargs): @cli.command('stream') +@click.option('-R', '--table_regions', default=[], multiple=True, + help='Page regions to analyze. Example: x1,y1,x2,y2' + ' where x1, y1 -> left-top and x2, y2 -> right-bottom.') @click.option('-T', '--table_areas', default=[], multiple=True, help='Table areas to process. Example: x1,y1,x2,y2' ' where x1, y1 -> left-top and x2, y2 -> right-bottom.') @@ -160,6 +168,8 @@ def stream(c, *args, **kwargs): filepath = kwargs.pop('filepath') kwargs.update(conf) + table_regions = list(kwargs['table_regions']) + kwargs['table_regions'] = None if not table_regions else table_regions table_areas = list(kwargs['table_areas']) kwargs['table_areas'] = None if not table_areas else table_areas columns = list(kwargs['columns']) diff --git a/camelot/image_processing.py b/camelot/image_processing.py index eb23101..8707d48 100644 --- a/camelot/image_processing.py +++ b/camelot/image_processing.py @@ -48,7 +48,8 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2): return img, threshold -def find_lines(threshold, direction='horizontal', line_size_scaling=15, iterations=0): +def find_lines(threshold, regions=None, direction='horizontal', + line_scale=15, iterations=0): """Finds horizontal and vertical lines by applying morphological transformations on an image. 
@@ -56,9 +57,13 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio ---------- threshold : object numpy.ndarray representing the thresholded image. + regions : list, optional (default: None) + List of page regions that may contain tables of the form x1,y1,x2,y2 + where (x1, y1) -> left-top and (x2, y2) -> right-bottom + in image coordinate space. direction : string, optional (default: 'horizontal') Specifies whether to find vertical or horizontal lines. - line_size_scaling : int, optional (default: 15) + line_scale : int, optional (default: 15) Factor by which the page dimensions will be divided to get smallest length of lines that should be detected. @@ -83,10 +88,10 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio lines = [] if direction == 'vertical': - size = threshold.shape[0] // line_size_scaling + size = threshold.shape[0] // line_scale el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size)) elif direction == 'horizontal': - size = threshold.shape[1] // line_size_scaling + size = threshold.shape[1] // line_scale el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1)) elif direction is None: raise ValueError("Specify direction as either 'vertical' or" @@ -112,11 +117,17 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio lines.append(((x1 + x2) // 2, y2, (x1 + x2) // 2, y1)) elif direction == 'horizontal': lines.append((x1, (y1 + y2) // 2, x2, (y1 + y2) // 2)) + if regions is not None: + region_mask = np.zeros(dmask.shape) + for region in regions: + x, y, w, h = region + region_mask[y : y + h, x : x + w] = 1 + dmask = np.multiply(dmask, region_mask) return dmask, lines -def find_table_contours(vertical, horizontal): +def find_contours(vertical, horizontal): """Finds table boundaries using OpenCV's findContours. 
Parameters @@ -138,11 +149,12 @@ def find_table_contours(vertical, horizontal): try: __, contours, __ = cv2.findContours( - mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) except ValueError: # for opencv backward compatibility contours, __ = cv2.findContours( - mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + # sort in reverse based on contour area and use first 10 contours contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10] cont = [] @@ -153,7 +165,7 @@ def find_table_contours(vertical, horizontal): return cont -def find_table_joints(contours, vertical, horizontal): +def find_joints(contours, vertical, horizontal): """Finds joints/intersections present inside each table boundary. Parameters @@ -176,18 +188,18 @@ def find_table_joints(contours, vertical, horizontal): and (x2, y2) -> rt in image coordinate space. """ - joints = np.bitwise_and(vertical, horizontal) + joints = np.multiply(vertical, horizontal) tables = {} for c in contours: x, y, w, h = c roi = joints[y : y + h, x : x + w] try: __, jc, __ = cv2.findContours( - roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) + roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) except ValueError: # for opencv backward compatibility jc, __ = cv2.findContours( - roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) + roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) if len(jc) <= 4: # remove contours with less than 4 joints continue joint_coords = [] diff --git a/camelot/io.py b/camelot/io.py index 44f3354..5162dd2 100644 --- a/camelot/io.py +++ b/camelot/io.py @@ -52,7 +52,7 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice', to generate columns. process_background* : bool, optional (default: False) Process background lines. 
- line_size_scaling* : int, optional (default: 15) + line_scale* : int, optional (default: 15) Line size scaling factor. The larger the value the smaller the detected lines. Making it very large will lead to text being detected as lines. diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index f4bc2c0..e061f65 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -16,7 +16,7 @@ from ..utils import (scale_image, scale_pdf, segments_in_bbox, text_in_bbox, merge_close_lines, get_table_index, compute_accuracy, compute_whitespace) from ..image_processing import (adaptive_threshold, find_lines, - find_table_contours, find_table_joints) + find_contours, find_joints) logger = logging.getLogger('camelot') @@ -28,13 +28,17 @@ class Lattice(BaseParser): Parameters ---------- + table_regions : list, optional (default: None) + List of page regions that may contain tables of the form x1,y1,x2,y2 + where (x1, y1) -> left-top and (x2, y2) -> right-bottom + in PDF coordinate space. table_areas : list, optional (default: None) List of table area strings of the form x1,y1,x2,y2 where (x1, y1) -> left-top and (x2, y2) -> right-bottom in PDF coordinate space. process_background : bool, optional (default: False) Process background lines. - line_size_scaling : int, optional (default: 15) + line_scale : int, optional (default: 15) Line size scaling factor. The larger the value the smaller the detected lines. Making it very large will lead to text being detected as lines. @@ -77,14 +81,15 @@ class Lattice(BaseParser): Resolution used for PDF to PNG conversion. 
""" - def __init__(self, table_areas=None, process_background=False, - line_size_scaling=15, copy_text=None, shift_text=['l', 't'], + def __init__(self, table_regions=None, table_areas=None, process_background=False, + line_scale=15, copy_text=None, shift_text=['l', 't'], split_text=False, flag_size=False, strip_text='', line_tol=2, joint_tol=2, threshold_blocksize=15, threshold_constant=-2, iterations=0, resolution=300, **kwargs): + self.table_regions = table_regions self.table_areas = table_areas self.process_background = process_background - self.line_size_scaling = line_size_scaling + self.line_scale = line_scale self.copy_text = copy_text self.shift_text = shift_text self.split_text = split_text @@ -239,14 +244,35 @@ class Lattice(BaseParser): image_scalers = (image_width_scaler, image_height_scaler, self.pdf_height) pdf_scalers = (pdf_width_scaler, pdf_height_scaler, image_height) - vertical_mask, vertical_segments = find_lines( - self.threshold, direction='vertical', - line_size_scaling=self.line_size_scaling, iterations=self.iterations) - horizontal_mask, horizontal_segments = find_lines( - self.threshold, direction='horizontal', - line_size_scaling=self.line_size_scaling, iterations=self.iterations) + if self.table_areas is None: + regions = None + if self.table_regions is not None: + regions = [] + for region in self.table_regions: + x1, y1, x2, y2 = region.split(",") + x1 = float(x1) + y1 = float(y1) + x2 = float(x2) + y2 = float(y2) + x1, y1, x2, y2 = scale_pdf((x1, y1, x2, y2), image_scalers) + regions.append((x1, y1, abs(x2 - x1), abs(y2 - y1))) + vertical_mask, vertical_segments = find_lines( + self.threshold, regions=regions, direction='vertical', + line_scale=self.line_scale, iterations=self.iterations) + horizontal_mask, horizontal_segments = find_lines( + self.threshold, regions=regions, direction='horizontal', + line_scale=self.line_scale, iterations=self.iterations) + + contours = find_contours(vertical_mask, horizontal_mask) + table_bbox = 
find_joints(contours, vertical_mask, horizontal_mask) + else: + vertical_mask, vertical_segments = find_lines( + self.threshold, direction='vertical', line_scale=self.line_scale, + iterations=self.iterations) + horizontal_mask, horizontal_segments = find_lines( + self.threshold, direction='horizontal', line_scale=self.line_scale, + iterations=self.iterations) - if self.table_areas is not None: areas = [] for area in self.table_areas: x1, y1, x2, y2 = area.split(",") @@ -256,10 +282,7 @@ class Lattice(BaseParser): y2 = float(y2) x1, y1, x2, y2 = scale_pdf((x1, y1, x2, y2), image_scalers) areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1))) - table_bbox = find_table_joints(areas, vertical_mask, horizontal_mask) - else: - contours = find_table_contours(vertical_mask, horizontal_mask) - table_bbox = find_table_joints(contours, vertical_mask, horizontal_mask) + table_bbox = find_joints(areas, vertical_mask, horizontal_mask) self.table_bbox_unscaled = copy.deepcopy(table_bbox) diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index d36212b..82c5fd1 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -26,6 +26,10 @@ class Stream(BaseParser): Parameters ---------- + table_regions : list, optional (default: None) + List of page regions that may contain tables of the form x1,y1,x2,y2 + where (x1, y1) -> left-top and (x2, y2) -> right-bottom + in PDF coordinate space. table_areas : list, optional (default: None) List of table area strings of the form x1,y1,x2,y2 where (x1, y1) -> left-top and (x2, y2) -> right-bottom @@ -51,9 +55,10 @@ class Stream(BaseParser): to generate columns. 
""" - def __init__(self, table_areas=None, columns=None, split_text=False, + def __init__(self, table_regions=None, table_areas=None, columns=None, split_text=False, flag_size=False, strip_text='', edge_tol=50, row_tol=2, column_tol=0, **kwargs): + self.table_regions = table_regions self.table_areas = table_areas self.columns = columns self._validate_columns() @@ -275,7 +280,13 @@ class Stream(BaseParser): def _generate_table_bbox(self): self.textedges = [] - if self.table_areas is not None: + if self.table_areas is None: + if self.table_regions is not None: + # filter horizontal text + pass + # find tables based on nurminen's detection algorithm + table_bbox = self._nurminen_table_detection(self.horizontal_text) + else: table_bbox = {} for area in self.table_areas: x1, y1, x2, y2 = area.split(",") @@ -284,9 +295,6 @@ class Stream(BaseParser): x2 = float(x2) y2 = float(y2) table_bbox[(x1, y2, x2, y1)] = None - else: - # find tables based on nurminen's detection algorithm - table_bbox = self._nurminen_table_detection(self.horizontal_text) self.table_bbox = table_bbox def _generate_columns_and_rows(self, table_idx, tk): diff --git a/camelot/utils.py b/camelot/utils.py index 3b78d5e..7b22307 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -101,7 +101,7 @@ stream_kwargs = [ ] lattice_kwargs = [ 'process_background', - 'line_size_scaling', + 'line_scale', 'copy_text', 'shift_text', 'line_tol', @@ -339,7 +339,7 @@ def text_in_bbox(bbox, text): ---------- bbox : tuple Tuple (x1, y1, x2, y2) representing a bounding box where - (x1, y1) -> lb and (x2, y2) -> rt in PDFMiner coordinate + (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate space. text : List of PDFMiner text objects. 
diff --git a/docs/user/advanced.rst b/docs/user/advanced.rst index ca40bb8..f454c1e 100644 --- a/docs/user/advanced.rst +++ b/docs/user/advanced.rst @@ -434,11 +434,11 @@ You can pass ``row_tol=<+int>`` to group the rows closer together, as shown belo Detect short lines ------------------ -There might be cases while using :ref:`Lattice <lattice>` when smaller lines don't get detected. The size of the smallest line that gets detected is calculated by dividing the PDF page's dimensions with a scaling factor called ``line_size_scaling``. By default, its value is 15. +There might be cases while using :ref:`Lattice <lattice>` when smaller lines don't get detected. The size of the smallest line that gets detected is calculated by dividing the PDF page's dimensions with a scaling factor called ``line_scale``. By default, its value is 15. -As you can guess, the larger the ``line_size_scaling``, the smaller the size of lines getting detected. +As you can guess, the larger the ``line_scale``, the smaller the size of lines getting detected. -.. warning:: Making ``line_size_scaling`` very large (>150) will lead to text getting detected as lines. +.. warning:: Making ``line_scale`` very large (>150) will lead to text getting detected as lines. Here's a `PDF <../_static/pdf/short_lines.pdf>`__ where small lines separating the the headers don't get detected with the default value of 15. @@ -458,11 +458,11 @@ Let's plot the table for this PDF. :alt: A plot of the PDF table with short lines :align: left -Clearly, the smaller lines separating the headers, couldn't be detected. Let's try with ``line_size_scaling=40``, and plot the table again. +Clearly, the smaller lines separating the headers, couldn't be detected. Let's try with ``line_scale=40``, and plot the table again. 
:: - >>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40) + >>> tables = camelot.read_pdf('short_lines.pdf', line_scale=40) >>> camelot.plot(tables[0], kind='grid') >>> plt.show() @@ -511,7 +511,7 @@ We'll use the `PDF <../_static/pdf/short_lines.pdf>`__ from the previous example :: - >>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40, shift_text=['']) + >>> tables = camelot.read_pdf('short_lines.pdf', line_scale=40, shift_text=['']) >>> tables[0].df .. csv-table:: @@ -532,7 +532,7 @@ No surprises there — it did remain in place (observe the strings "2400" and "A :: - >>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40, shift_text=['r', 'b']) + >>> tables = camelot.read_pdf('short_lines.pdf', line_scale=40, shift_text=['r', 'b']) >>> tables[0].df .. tip:: diff --git a/tests/test_common.py b/tests/test_common.py index f9f26bf..3a24c55 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -179,7 +179,7 @@ def test_lattice_copy_text(): df = pd.DataFrame(data_lattice_copy_text) filename = os.path.join(testdir, "row_span_1.pdf") - tables = camelot.read_pdf(filename, line_size_scaling=60, copy_text="v") + tables = camelot.read_pdf(filename, line_scale=60, copy_text="v") assert df.equals(tables[0].df) @@ -189,13 +189,13 @@ def test_lattice_shift_text(): df_rb = pd.DataFrame(data_lattice_shift_text_right_bottom) filename = os.path.join(testdir, "column_span_2.pdf") - tables = camelot.read_pdf(filename, line_size_scaling=40) + tables = camelot.read_pdf(filename, line_scale=40) assert df_lt.equals(tables[0].df) - tables = camelot.read_pdf(filename, line_size_scaling=40, shift_text=['']) + tables = camelot.read_pdf(filename, line_scale=40, shift_text=['']) assert df_disable.equals(tables[0].df) - tables = camelot.read_pdf(filename, line_size_scaling=40, shift_text=['r', 'b']) + tables = camelot.read_pdf(filename, line_scale=40, shift_text=['r', 'b']) assert df_rb.equals(tables[0].df)