Merge 9161ef3822 into 644bbe7c6d

2021-07-15 12:37:12 -04:00 · 2021-07-15 12:37:12 -04:00 · 95afdde6e2
parent 644bbe7c6d 9161ef3822
commit 95afdde6e2
6 changed files with 34 additions and 1 deletions
--- a/camelot/cli.py
+++ b/camelot/cli.py
@ -100,6 +100,9 @@ def cli(ctx, *args, **kwargs):
@click.option(
    "-back", "--process_background", is_flag=True, help="Process background lines."
 )
@click.option(
    "-color", "--process_color_background", is_flag=True, help="Increase contrast for better background line processing."
 )
@click.option(
    "-scale",
    "--line_scale",
--- a/camelot/image_processing.py
+++ b/camelot/image_processing.py
@ -4,7 +4,7 @@ import cv2
 import numpy as np
-def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
+def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2, process_color_background=False, saturation_threshold=5):
    """Thresholds an image using OpenCV's adaptiveThreshold.
    Parameters
@ -36,6 +36,17 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    if process_background:
        if process_color_background:
            hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
            initial = hsv[:, :, 1]
            hsv[initial > saturation_threshold, 0] = 0
            hsv[initial > saturation_threshold, 1] = 255
            hsv[initial > saturation_threshold, 2] = 0
            hsv[initial <= saturation_threshold, 0] = 128
            hsv[initial <= saturation_threshold, 1] = 0
            hsv[initial <= saturation_threshold, 2] = 255
            hsv[initial == 255, 1] = 0
            gray = cv2.cvtColor(hsv, cv2.COLOR_BGR2GRAY)
        threshold = cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c
        )
--- a/camelot/io.py
+++ b/camelot/io.py
@ -59,6 +59,8 @@ def read_pdf(
        to generate columns.
    process_background* : bool, optional (default: False)
        Process background lines.
    process_color_background* : bool, optional (default: False)
        Increase contrast for better background line processing.
    line_scale* : int, optional (default: 15)
        Line size scaling factor. The larger the value the smaller
        the detected lines. Making it very large will lead to text
--- a/camelot/parsers/lattice.py
+++ b/camelot/parsers/lattice.py
@ -99,6 +99,7 @@ class Lattice(BaseParser):
        table_regions=None,
        table_areas=None,
        process_background=False,
        process_color_background=False,
        line_scale=15,
        copy_text=None,
        shift_text=["l", "t"],
@ -117,6 +118,7 @@ class Lattice(BaseParser):
        self.table_regions = table_regions
        self.table_areas = table_areas
        self.process_background = process_background
        self.process_color_background = process_color_background
        self.line_scale = line_scale
        self.copy_text = copy_text
        self.shift_text = shift_text
@ -255,6 +257,7 @@ class Lattice(BaseParser):
        self.image, self.threshold = adaptive_threshold(
            self.imagename,
            process_background=self.process_background,
            process_color_background=self.process_color_background,
            blocksize=self.threshold_blocksize,
            c=self.threshold_constant,
        )
--- a/camelot/utils.py
+++ b/camelot/utils.py
@ -96,6 +96,7 @@ def download_url(url):
 stream_kwargs = ["columns", "edge_tol", "row_tol", "column_tol"]
 lattice_kwargs = [
    "process_background",
    "process_color_background",
    "line_scale",
    "copy_text",
    "shift_text",
--- a/docs/user/advanced.rst
+++ b/docs/user/advanced.rst
@ -33,6 +33,19 @@ To process background lines, you can pass ``process_background=True``.
 .. csv-table::
  :file: ../_static/csv/background_lines.csv
 If there's too little contrast between the table background color and the document background color, you can additionally pass the experimental option ``process_color_background=True`` alongside ``process_background=True``.
 ::
    >>> tables = camelot.read_pdf('background_lines.pdf', process_background=True, process_color_background=True)
    >>> tables[1].df
 .. tip::
    Here's how you can do the same with the :ref:`command-line interface <cli>`.
    ::
        $ camelot lattice -back -color background_lines.pdf
 Visual debugging
 ----------------