pull/203/merge
NoReflex 2021-07-15 12:37:12 -04:00 committed by GitHub
commit 95afdde6e2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 34 additions and 1 deletions

View File

@ -100,6 +100,9 @@ def cli(ctx, *args, **kwargs):
@click.option(
"-back", "--process_background", is_flag=True, help="Process background lines."
)
@click.option(
"-color", "--process_color_background", is_flag=True, help="Increase contrast for better background line processing."
)
@click.option(
"-scale",
"--line_scale",

View File

@ -4,7 +4,7 @@ import cv2
import numpy as np
def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2, process_color_background=False, saturation_threshold=5):
"""Thresholds an image using OpenCV's adaptiveThreshold.
Parameters
@ -36,6 +36,17 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
if process_background:
if process_color_background:
hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
initial = hsv[:, :, 1]
hsv[initial > saturation_threshold, 0] = 0
hsv[initial > saturation_threshold, 1] = 255
hsv[initial > saturation_threshold, 2] = 0
hsv[initial <= saturation_threshold, 0] = 128
hsv[initial <= saturation_threshold, 1] = 0
hsv[initial <= saturation_threshold, 2] = 255
hsv[initial == 255, 1] = 0
gray = cv2.cvtColor(hsv, cv2.COLOR_BGR2GRAY)
threshold = cv2.adaptiveThreshold(
gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c
)

View File

@ -59,6 +59,8 @@ def read_pdf(
to generate columns.
process_background* : bool, optional (default: False)
Process background lines.
process_color_background* : bool, optional (default: False)
Increase contrast for better background line processing.
line_scale* : int, optional (default: 15)
Line size scaling factor. The larger the value the smaller
the detected lines. Making it very large will lead to text

View File

@ -99,6 +99,7 @@ class Lattice(BaseParser):
table_regions=None,
table_areas=None,
process_background=False,
process_color_background=False,
line_scale=15,
copy_text=None,
shift_text=["l", "t"],
@ -117,6 +118,7 @@ class Lattice(BaseParser):
self.table_regions = table_regions
self.table_areas = table_areas
self.process_background = process_background
self.process_color_background = process_color_background
self.line_scale = line_scale
self.copy_text = copy_text
self.shift_text = shift_text
@ -255,6 +257,7 @@ class Lattice(BaseParser):
self.image, self.threshold = adaptive_threshold(
self.imagename,
process_background=self.process_background,
process_color_background=self.process_color_background,
blocksize=self.threshold_blocksize,
c=self.threshold_constant,
)

View File

@ -96,6 +96,7 @@ def download_url(url):
stream_kwargs = ["columns", "edge_tol", "row_tol", "column_tol"]
lattice_kwargs = [
"process_background",
"process_color_background",
"line_scale",
"copy_text",
"shift_text",

View File

@ -33,6 +33,19 @@ To process background lines, you can pass ``process_background=True``.
.. csv-table::
:file: ../_static/csv/background_lines.csv
If there's too little contrast between the table background color and the document background color, you can try combining ``process_background=True`` with the experimental option ``process_color_background=True``.
::
>>> tables = camelot.read_pdf('background_lines.pdf', process_background=True, process_color_background=True)
>>> tables[1].df
.. tip::
Here's how you can do the same with the :ref:`command-line interface <cli>`.
::
$ camelot lattice -back -color background_lines.pdf
Visual debugging
----------------