diff --git a/HISTORY.md b/HISTORY.md index 6a1e9c4..e4abe83 100755 --- a/HISTORY.md +++ b/HISTORY.md @@ -4,6 +4,13 @@ Release History master ------ +**Improvements** + +* [#209](https://github.com/socialcopsdev/camelot/issues/209) Add support to analyze only certain page regions to look for tables. [#243](https://github.com/socialcopsdev/camelot/pull/243) by Vinayak Mehta. + * You can use `table_regions` in `read_pdf()` to specify approximate page regions which may contain tables. + * Kwarg `line_size_scaling` is now called `line_scale`. +* [#239](https://github.com/socialcopsdev/camelot/issues/239) Raise warning if PDF is image-based. [#240](https://github.com/socialcopsdev/camelot/pull/240) by Vinayak Mehta. + 0.6.0 (2018-12-24) ------------------ diff --git a/camelot/cli.py b/camelot/cli.py index 3f240e0..a2b45a5 100644 --- a/camelot/cli.py +++ b/camelot/cli.py @@ -56,12 +56,15 @@ def cli(ctx, *args, **kwargs): @cli.command('lattice') +@click.option('-R', '--table_regions', default=[], multiple=True, + help='Page regions to analyze. Example: x1,y1,x2,y2' + ' where x1, y1 -> left-top and x2, y2 -> right-bottom.') @click.option('-T', '--table_areas', default=[], multiple=True, help='Table areas to process. Example: x1,y1,x2,y2' ' where x1, y1 -> left-top and x2, y2 -> right-bottom.') @click.option('-back', '--process_background', is_flag=True, help='Process background lines.') -@click.option('-scale', '--line_size_scaling', default=15, +@click.option('-scale', '--line_scale', default=15, help='Line size scaling factor. 
The larger the value,' ' the smaller the detected lines.') @click.option('-copy', '--copy_text', default=[], type=click.Choice(['h', 'v']), @@ -105,6 +108,8 @@ def lattice(c, *args, **kwargs): filepath = kwargs.pop('filepath') kwargs.update(conf) + table_regions = list(kwargs['table_regions']) + kwargs['table_regions'] = None if not table_regions else table_regions table_areas = list(kwargs['table_areas']) kwargs['table_areas'] = None if not table_areas else table_areas copy_text = list(kwargs['copy_text']) @@ -132,6 +137,9 @@ def lattice(c, *args, **kwargs): @cli.command('stream') +@click.option('-R', '--table_regions', default=[], multiple=True, + help='Page regions to analyze. Example: x1,y1,x2,y2' + ' where x1, y1 -> left-top and x2, y2 -> right-bottom.') @click.option('-T', '--table_areas', default=[], multiple=True, help='Table areas to process. Example: x1,y1,x2,y2' ' where x1, y1 -> left-top and x2, y2 -> right-bottom.') @@ -160,6 +168,8 @@ def stream(c, *args, **kwargs): filepath = kwargs.pop('filepath') kwargs.update(conf) + table_regions = list(kwargs['table_regions']) + kwargs['table_regions'] = None if not table_regions else table_regions table_areas = list(kwargs['table_areas']) kwargs['table_areas'] = None if not table_areas else table_areas columns = list(kwargs['columns']) diff --git a/camelot/image_processing.py b/camelot/image_processing.py index eb23101..3051852 100644 --- a/camelot/image_processing.py +++ b/camelot/image_processing.py @@ -48,7 +48,8 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2): return img, threshold -def find_lines(threshold, direction='horizontal', line_size_scaling=15, iterations=0): +def find_lines(threshold, regions=None, direction='horizontal', + line_scale=15, iterations=0): """Finds horizontal and vertical lines by applying morphological transformations on an image. 
@@ -56,9 +57,13 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio ---------- threshold : object numpy.ndarray representing the thresholded image. + regions : list, optional (default: None) + List of page regions that may contain tables of the form x1,y1,x2,y2 + where (x1, y1) -> left-top and (x2, y2) -> right-bottom + in image coordinate space. direction : string, optional (default: 'horizontal') Specifies whether to find vertical or horizontal lines. - line_size_scaling : int, optional (default: 15) + line_scale : int, optional (default: 15) Factor by which the page dimensions will be divided to get smallest length of lines that should be detected. @@ -83,26 +88,33 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio lines = [] if direction == 'vertical': - size = threshold.shape[0] // line_size_scaling + size = threshold.shape[0] // line_scale el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size)) elif direction == 'horizontal': - size = threshold.shape[1] // line_size_scaling + size = threshold.shape[1] // line_scale el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1)) elif direction is None: raise ValueError("Specify direction as either 'vertical' or" " 'horizontal'") + if regions is not None: + region_mask = np.zeros(threshold.shape) + for region in regions: + x, y, w, h = region + region_mask[y : y + h, x : x + w] = 1 + threshold = np.multiply(threshold, region_mask) + threshold = cv2.erode(threshold, el) threshold = cv2.dilate(threshold, el) dmask = cv2.dilate(threshold, el, iterations=iterations) try: _, contours, _ = cv2.findContours( - threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) except ValueError: # for opencv backward compatibility contours, _ = cv2.findContours( - threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) for c 
in contours: x, y, w, h = cv2.boundingRect(c) @@ -116,7 +128,7 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio return dmask, lines -def find_table_contours(vertical, horizontal): +def find_contours(vertical, horizontal): """Finds table boundaries using OpenCV's findContours. Parameters @@ -138,11 +150,12 @@ def find_table_contours(vertical, horizontal): try: __, contours, __ = cv2.findContours( - mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) except ValueError: # for opencv backward compatibility contours, __ = cv2.findContours( - mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + # sort in reverse based on contour area and use first 10 contours contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10] cont = [] @@ -153,7 +166,7 @@ def find_table_contours(vertical, horizontal): return cont -def find_table_joints(contours, vertical, horizontal): +def find_joints(contours, vertical, horizontal): """Finds joints/intersections present inside each table boundary. Parameters @@ -176,18 +189,18 @@ def find_table_joints(contours, vertical, horizontal): and (x2, y2) -> rt in image coordinate space. 
""" - joints = np.bitwise_and(vertical, horizontal) + joints = np.multiply(vertical, horizontal) tables = {} for c in contours: x, y, w, h = c roi = joints[y : y + h, x : x + w] try: __, jc, __ = cv2.findContours( - roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) + roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) except ValueError: # for opencv backward compatibility jc, __ = cv2.findContours( - roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) + roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) if len(jc) <= 4: # remove contours with less than 4 joints continue joint_coords = [] diff --git a/camelot/io.py b/camelot/io.py index 44f3354..5162dd2 100644 --- a/camelot/io.py +++ b/camelot/io.py @@ -52,7 +52,7 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice', to generate columns. process_background* : bool, optional (default: False) Process background lines. - line_size_scaling* : int, optional (default: 15) + line_scale* : int, optional (default: 15) Line size scaling factor. The larger the value the smaller the detected lines. Making it very large will lead to text being detected as lines. diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index f4bc2c0..ab7d3be 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -16,7 +16,7 @@ from ..utils import (scale_image, scale_pdf, segments_in_bbox, text_in_bbox, merge_close_lines, get_table_index, compute_accuracy, compute_whitespace) from ..image_processing import (adaptive_threshold, find_lines, - find_table_contours, find_table_joints) + find_contours, find_joints) logger = logging.getLogger('camelot') @@ -28,13 +28,17 @@ class Lattice(BaseParser): Parameters ---------- + table_regions : list, optional (default: None) + List of page regions that may contain tables of the form x1,y1,x2,y2 + where (x1, y1) -> left-top and (x2, y2) -> right-bottom + in PDF coordinate space. 
table_areas : list, optional (default: None) List of table area strings of the form x1,y1,x2,y2 where (x1, y1) -> left-top and (x2, y2) -> right-bottom in PDF coordinate space. process_background : bool, optional (default: False) Process background lines. - line_size_scaling : int, optional (default: 15) + line_scale : int, optional (default: 15) Line size scaling factor. The larger the value the smaller the detected lines. Making it very large will lead to text being detected as lines. @@ -77,14 +81,15 @@ class Lattice(BaseParser): Resolution used for PDF to PNG conversion. """ - def __init__(self, table_areas=None, process_background=False, - line_size_scaling=15, copy_text=None, shift_text=['l', 't'], + def __init__(self, table_regions=None, table_areas=None, process_background=False, + line_scale=15, copy_text=None, shift_text=['l', 't'], split_text=False, flag_size=False, strip_text='', line_tol=2, joint_tol=2, threshold_blocksize=15, threshold_constant=-2, iterations=0, resolution=300, **kwargs): + self.table_regions = table_regions self.table_areas = table_areas self.process_background = process_background - self.line_size_scaling = line_size_scaling + self.line_scale = line_scale self.copy_text = copy_text self.shift_text = shift_text self.split_text = split_text @@ -227,9 +232,22 @@ class Lattice(BaseParser): stderr=subprocess.STDOUT) def _generate_table_bbox(self): + def scale_areas(areas): + scaled_areas = [] + for area in areas: + x1, y1, x2, y2 = area.split(",") + x1 = float(x1) + y1 = float(y1) + x2 = float(x2) + y2 = float(y2) + x1, y1, x2, y2 = scale_pdf((x1, y1, x2, y2), image_scalers) + scaled_areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1))) + return scaled_areas + self.image, self.threshold = adaptive_threshold( self.imagename, process_background=self.process_background, blocksize=self.threshold_blocksize, c=self.threshold_constant) + image_width = self.image.shape[1] image_height = self.image.shape[0] image_width_scaler = image_width / 
float(self.pdf_width) @@ -239,27 +257,30 @@ class Lattice(BaseParser): image_scalers = (image_width_scaler, image_height_scaler, self.pdf_height) pdf_scalers = (pdf_width_scaler, pdf_height_scaler, image_height) - vertical_mask, vertical_segments = find_lines( - self.threshold, direction='vertical', - line_size_scaling=self.line_size_scaling, iterations=self.iterations) - horizontal_mask, horizontal_segments = find_lines( - self.threshold, direction='horizontal', - line_size_scaling=self.line_size_scaling, iterations=self.iterations) + if self.table_areas is None: + regions = None + if self.table_regions is not None: + regions = scale_areas(self.table_regions) - if self.table_areas is not None: - areas = [] - for area in self.table_areas: - x1, y1, x2, y2 = area.split(",") - x1 = float(x1) - y1 = float(y1) - x2 = float(x2) - y2 = float(y2) - x1, y1, x2, y2 = scale_pdf((x1, y1, x2, y2), image_scalers) - areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1))) - table_bbox = find_table_joints(areas, vertical_mask, horizontal_mask) + vertical_mask, vertical_segments = find_lines( + self.threshold, regions=regions, direction='vertical', + line_scale=self.line_scale, iterations=self.iterations) + horizontal_mask, horizontal_segments = find_lines( + self.threshold, regions=regions, direction='horizontal', + line_scale=self.line_scale, iterations=self.iterations) + + contours = find_contours(vertical_mask, horizontal_mask) + table_bbox = find_joints(contours, vertical_mask, horizontal_mask) else: - contours = find_table_contours(vertical_mask, horizontal_mask) - table_bbox = find_table_joints(contours, vertical_mask, horizontal_mask) + vertical_mask, vertical_segments = find_lines( + self.threshold, direction='vertical', line_scale=self.line_scale, + iterations=self.iterations) + horizontal_mask, horizontal_segments = find_lines( + self.threshold, direction='horizontal', line_scale=self.line_scale, + iterations=self.iterations) + + areas = scale_areas(self.table_areas) + 
table_bbox = find_joints(areas, vertical_mask, horizontal_mask) self.table_bbox_unscaled = copy.deepcopy(table_bbox) diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index d36212b..1efe144 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -26,6 +26,10 @@ class Stream(BaseParser): Parameters ---------- + table_regions : list, optional (default: None) + List of page regions that may contain tables of the form x1,y1,x2,y2 + where (x1, y1) -> left-top and (x2, y2) -> right-bottom + in PDF coordinate space. table_areas : list, optional (default: None) List of table area strings of the form x1,y1,x2,y2 where (x1, y1) -> left-top and (x2, y2) -> right-bottom @@ -51,9 +55,10 @@ class Stream(BaseParser): to generate columns. """ - def __init__(self, table_areas=None, columns=None, split_text=False, + def __init__(self, table_regions=None, table_areas=None, columns=None, split_text=False, flag_size=False, strip_text='', edge_tol=50, row_tol=2, column_tol=0, **kwargs): + self.table_regions = table_regions self.table_areas = table_areas self.columns = columns self._validate_columns() @@ -275,7 +280,18 @@ class Stream(BaseParser): def _generate_table_bbox(self): self.textedges = [] - if self.table_areas is not None: + if self.table_areas is None: + hor_text = self.horizontal_text + if self.table_regions is not None: + # filter horizontal text + hor_text = [] + for region in self.table_regions: + x1, y1, x2, y2 = region + region_text = text_in_bbox((x1, y2, x2, y1), self.horizontal_text) + hor_text.extend(region_text) + # find tables based on nurminen's detection algorithm + table_bbox = self._nurminen_table_detection(hor_text) + else: table_bbox = {} for area in self.table_areas: x1, y1, x2, y2 = area.split(",") @@ -284,9 +300,6 @@ class Stream(BaseParser): x2 = float(x2) y2 = float(y2) table_bbox[(x1, y2, x2, y1)] = None - else: - # find tables based on nurminen's detection algorithm - table_bbox = 
self._nurminen_table_detection(self.horizontal_text) self.table_bbox = table_bbox def _generate_columns_and_rows(self, table_idx, tk): diff --git a/camelot/utils.py b/camelot/utils.py index 3b78d5e..7b22307 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -101,7 +101,7 @@ stream_kwargs = [ ] lattice_kwargs = [ 'process_background', - 'line_size_scaling', + 'line_scale', 'copy_text', 'shift_text', 'line_tol', @@ -339,7 +339,7 @@ def text_in_bbox(bbox, text): ---------- bbox : tuple Tuple (x1, y1, x2, y2) representing a bounding box where - (x1, y1) -> lb and (x2, y2) -> rt in PDFMiner coordinate + (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate space. text : List of PDFMiner text objects. diff --git a/docs/_static/csv/table_regions.csv b/docs/_static/csv/table_regions.csv new file mode 100644 index 0000000..caf534e --- /dev/null +++ b/docs/_static/csv/table_regions.csv @@ -0,0 +1,4 @@ +"Età dell’Assicuratoall’epoca del decesso","Misura % dimaggiorazione" +"18-75","1,00%" +"76-80","0,50%" +"81 in poi","0,10%" diff --git a/docs/_static/pdf/table_regions.pdf b/docs/_static/pdf/table_regions.pdf new file mode 100644 index 0000000..f6f053b Binary files /dev/null and b/docs/_static/pdf/table_regions.pdf differ diff --git a/docs/user/advanced.rst b/docs/user/advanced.rst index ca40bb8..e7b4ab7 100644 --- a/docs/user/advanced.rst +++ b/docs/user/advanced.rst @@ -206,12 +206,10 @@ You can also visualize the textedges found on a page by specifying ``kind='texte Specify table areas ------------------- -In cases such as `these <../_static/pdf/table_areas.pdf>`__, it can be useful to specify table boundaries. You can plot the text on this page and note the top left and bottom right coordinates of the table. +In cases such as `these <../_static/pdf/table_areas.pdf>`__, it can be useful to specify exact table boundaries. You can plot the text on this page and note the top left and bottom right coordinates of the table. 
Table areas that you want Camelot to analyze can be passed as a list of comma-separated strings to :meth:`read_pdf() `, using the ``table_areas`` keyword argument. -.. _for now: https://github.com/socialcopsdev/camelot/issues/102 - :: >>> tables = camelot.read_pdf('table_areas.pdf', flavor='stream', table_areas=['316,499,566,337']) @@ -226,6 +224,27 @@ Table areas that you want Camelot to analyze can be passed as a list of comma-se .. csv-table:: :file: ../_static/csv/table_areas.csv +Specify table regions +--------------------- + +However, there may be cases like `[1] <../_static/pdf/table_regions.pdf>`__ and `[2] `__, where the table might not lie at the exact coordinates every time but in an approximate region. + +You can use the ``table_regions`` keyword argument to :meth:`read_pdf() ` to handle such cases. When ``table_regions`` is specified, Camelot will only analyze the specified regions to look for tables. + +:: + + >>> tables = camelot.read_pdf('table_regions.pdf', table_regions=['170,370,560,270']) + >>> tables[0].df + +.. tip:: + Here's how you can do the same with the :ref:`command-line interface `. + :: + + $ camelot lattice -R 170,370,560,270 table_regions.pdf + +.. csv-table:: + :file: ../_static/csv/table_regions.csv + Specify column separators ------------------------- @@ -434,11 +453,11 @@ You can pass ``row_tol=<+int>`` to group the rows closer together, as shown belo Detect short lines ------------------ -There might be cases while using :ref:`Lattice ` when smaller lines don't get detected. The size of the smallest line that gets detected is calculated by dividing the PDF page's dimensions with a scaling factor called ``line_size_scaling``. By default, its value is 15. +There might be cases while using :ref:`Lattice ` when smaller lines don't get detected. The size of the smallest line that gets detected is calculated by dividing the PDF page's dimensions with a scaling factor called ``line_scale``. By default, its value is 15. 
-As you can guess, the larger the ``line_size_scaling``, the smaller the size of lines getting detected. +As you can guess, the larger the ``line_scale``, the smaller the size of lines getting detected. -.. warning:: Making ``line_size_scaling`` very large (>150) will lead to text getting detected as lines. +.. warning:: Making ``line_scale`` very large (>150) will lead to text getting detected as lines. Here's a `PDF <../_static/pdf/short_lines.pdf>`__ where small lines separating the the headers don't get detected with the default value of 15. @@ -458,11 +477,11 @@ Let's plot the table for this PDF. :alt: A plot of the PDF table with short lines :align: left -Clearly, the smaller lines separating the headers, couldn't be detected. Let's try with ``line_size_scaling=40``, and plot the table again. +Clearly, the smaller lines separating the headers, couldn't be detected. Let's try with ``line_scale=40``, and plot the table again. :: - >>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40) + >>> tables = camelot.read_pdf('short_lines.pdf', line_scale=40) >>> camelot.plot(tables[0], kind='grid') >>> plt.show() @@ -511,7 +530,7 @@ We'll use the `PDF <../_static/pdf/short_lines.pdf>`__ from the previous example :: - >>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40, shift_text=['']) + >>> tables = camelot.read_pdf('short_lines.pdf', line_scale=40, shift_text=['']) >>> tables[0].df .. csv-table:: @@ -532,7 +551,7 @@ No surprises there — it did remain in place (observe the strings "2400" and "A :: - >>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40, shift_text=['r', 'b']) + >>> tables = camelot.read_pdf('short_lines.pdf', line_scale=40, shift_text=['r', 'b']) >>> tables[0].df .. 
tip:: diff --git a/tests/data.py b/tests/data.py index c223227..99527d5 100755 --- a/tests/data.py +++ b/tests/data.py @@ -427,6 +427,13 @@ data_lattice_two_tables_2 = [ ["Pooled", "23889", "47.7", "1.5", "9.9", "19.9", "17.8", "3.3"] ] +data_lattice_table_regions = [ + ['Età dell’Assicurato \nall’epoca del decesso', 'Misura % di \nmaggiorazione'], + ['18-75', '1,00%'], + ['76-80', '0,50%'], + ['81 in poi', '0,10%'] +] + data_lattice_table_areas = [ ["", "", "", "", "", "", "", "", ""], ["State", "n", "Literacy Status", "", "", "", "", "", ""], diff --git a/tests/files/table_region.pdf b/tests/files/table_region.pdf new file mode 100644 index 0000000..f6f053b Binary files /dev/null and b/tests/files/table_region.pdf differ diff --git a/tests/test_common.py b/tests/test_common.py index f9f26bf..2335060 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -159,6 +159,14 @@ def test_lattice_two_tables(): assert df2.equals(tables[1].df) +def test_lattice_table_regions(): + df = pd.DataFrame(data_lattice_table_regions) + + filename = os.path.join(testdir, "table_region.pdf") + tables = camelot.read_pdf(filename, table_regions=["170,370,560,270"]) + assert df.equals(tables[0].df) + + def test_lattice_table_areas(): df = pd.DataFrame(data_lattice_table_areas) @@ -179,7 +187,7 @@ def test_lattice_copy_text(): df = pd.DataFrame(data_lattice_copy_text) filename = os.path.join(testdir, "row_span_1.pdf") - tables = camelot.read_pdf(filename, line_size_scaling=60, copy_text="v") + tables = camelot.read_pdf(filename, line_scale=60, copy_text="v") assert df.equals(tables[0].df) @@ -189,13 +197,13 @@ def test_lattice_shift_text(): df_rb = pd.DataFrame(data_lattice_shift_text_right_bottom) filename = os.path.join(testdir, "column_span_2.pdf") - tables = camelot.read_pdf(filename, line_size_scaling=40) + tables = camelot.read_pdf(filename, line_scale=40) assert df_lt.equals(tables[0].df) - tables = camelot.read_pdf(filename, line_size_scaling=40, shift_text=['']) + tables 
= camelot.read_pdf(filename, line_scale=40, shift_text=['']) assert df_disable.equals(tables[0].df) - tables = camelot.read_pdf(filename, line_size_scaling=40, shift_text=['r', 'b']) + tables = camelot.read_pdf(filename, line_scale=40, shift_text=['r', 'b']) assert df_rb.equals(tables[0].df)