pull/203/merge
NoReflex 2021-07-15 12:37:12 -04:00 committed by GitHub
commit 95afdde6e2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 34 additions and 1 deletions

View File

@ -100,6 +100,9 @@ def cli(ctx, *args, **kwargs):
@click.option( @click.option(
"-back", "--process_background", is_flag=True, help="Process background lines." "-back", "--process_background", is_flag=True, help="Process background lines."
) )
@click.option(
"-color", "--process_color_background", is_flag=True, help="Increase contrast for better background line processing."
)
@click.option( @click.option(
"-scale", "-scale",
"--line_scale", "--line_scale",

View File

@ -4,7 +4,7 @@ import cv2
import numpy as np import numpy as np
def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2): def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2, process_color_background=False, saturation_threshold=5):
"""Thresholds an image using OpenCV's adaptiveThreshold. """Thresholds an image using OpenCV's adaptiveThreshold.
Parameters Parameters
@ -36,6 +36,17 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
if process_background: if process_background:
if process_color_background:
hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
initial = hsv[:, :, 1]
hsv[initial > saturation_threshold, 0] = 0
hsv[initial > saturation_threshold, 1] = 255
hsv[initial > saturation_threshold, 2] = 0
hsv[initial <= saturation_threshold, 0] = 128
hsv[initial <= saturation_threshold, 1] = 0
hsv[initial <= saturation_threshold, 2] = 255
hsv[initial == 255, 1] = 0
gray = cv2.cvtColor(hsv, cv2.COLOR_BGR2GRAY)
threshold = cv2.adaptiveThreshold( threshold = cv2.adaptiveThreshold(
gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c
) )

View File

@ -59,6 +59,8 @@ def read_pdf(
to generate columns. to generate columns.
process_background* : bool, optional (default: False) process_background* : bool, optional (default: False)
Process background lines. Process background lines.
process_color_background* : bool, optional (default: False)
Increase contrast for better background line processing.
line_scale* : int, optional (default: 15) line_scale* : int, optional (default: 15)
Line size scaling factor. The larger the value the smaller Line size scaling factor. The larger the value the smaller
the detected lines. Making it very large will lead to text the detected lines. Making it very large will lead to text

View File

@ -99,6 +99,7 @@ class Lattice(BaseParser):
table_regions=None, table_regions=None,
table_areas=None, table_areas=None,
process_background=False, process_background=False,
process_color_background=False,
line_scale=15, line_scale=15,
copy_text=None, copy_text=None,
shift_text=["l", "t"], shift_text=["l", "t"],
@ -117,6 +118,7 @@ class Lattice(BaseParser):
self.table_regions = table_regions self.table_regions = table_regions
self.table_areas = table_areas self.table_areas = table_areas
self.process_background = process_background self.process_background = process_background
self.process_color_background = process_color_background
self.line_scale = line_scale self.line_scale = line_scale
self.copy_text = copy_text self.copy_text = copy_text
self.shift_text = shift_text self.shift_text = shift_text
@ -255,6 +257,7 @@ class Lattice(BaseParser):
self.image, self.threshold = adaptive_threshold( self.image, self.threshold = adaptive_threshold(
self.imagename, self.imagename,
process_background=self.process_background, process_background=self.process_background,
process_color_background=self.process_color_background,
blocksize=self.threshold_blocksize, blocksize=self.threshold_blocksize,
c=self.threshold_constant, c=self.threshold_constant,
) )

View File

@ -96,6 +96,7 @@ def download_url(url):
stream_kwargs = ["columns", "edge_tol", "row_tol", "column_tol"] stream_kwargs = ["columns", "edge_tol", "row_tol", "column_tol"]
lattice_kwargs = [ lattice_kwargs = [
"process_background", "process_background",
"process_color_background",
"line_scale", "line_scale",
"copy_text", "copy_text",
"shift_text", "shift_text",

View File

@ -33,6 +33,19 @@ To process background lines, you can pass ``process_background=True``.
.. csv-table:: .. csv-table::
:file: ../_static/csv/background_lines.csv :file: ../_static/csv/background_lines.csv
If there's too little contrast between the table background color and the document background color, you can try combining ``process_background=True`` with the experimental option ``process_color_background=True``.
::
>>> tables = camelot.read_pdf('background_lines.pdf', process_background=True, process_color_background=True)
>>> tables[1].df
.. tip::
Here's how you can do the same with the :ref:`command-line interface <cli>`.
::
$ camelot lattice -back -color background_lines.pdf
Visual debugging Visual debugging
---------------- ----------------