Merge 9161ef3822 into 644bbe7c6d
commit
95afdde6e2
|
|
@ -100,6 +100,9 @@ def cli(ctx, *args, **kwargs):
|
||||||
@click.option(
|
@click.option(
|
||||||
"-back", "--process_background", is_flag=True, help="Process background lines."
|
"-back", "--process_background", is_flag=True, help="Process background lines."
|
||||||
)
|
)
|
||||||
|
@click.option(
|
||||||
|
"-color", "--process_color_background", is_flag=True, help="Increase contrast for better background line processing."
|
||||||
|
)
|
||||||
@click.option(
|
@click.option(
|
||||||
"-scale",
|
"-scale",
|
||||||
"--line_scale",
|
"--line_scale",
|
||||||
|
|
|
||||||
|
|
@ -4,7 +4,7 @@ import cv2
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
|
def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2, process_color_background=False, saturation_threshold=5):
|
||||||
"""Thresholds an image using OpenCV's adaptiveThreshold.
|
"""Thresholds an image using OpenCV's adaptiveThreshold.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
|
|
@ -36,6 +36,17 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
|
||||||
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
|
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
|
||||||
|
|
||||||
if process_background:
|
if process_background:
|
||||||
|
if process_color_background:
|
||||||
|
hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
|
||||||
|
initial = hsv[:, :, 1]
|
||||||
|
hsv[initial > saturation_threshold, 0] = 0
|
||||||
|
hsv[initial > saturation_threshold, 1] = 255
|
||||||
|
hsv[initial > saturation_threshold, 2] = 0
|
||||||
|
hsv[initial <= saturation_threshold, 0] = 128
|
||||||
|
hsv[initial <= saturation_threshold, 1] = 0
|
||||||
|
hsv[initial <= saturation_threshold, 2] = 255
|
||||||
|
hsv[initial == 255, 1] = 0
|
||||||
|
gray = cv2.cvtColor(hsv, cv2.COLOR_BGR2GRAY)
|
||||||
threshold = cv2.adaptiveThreshold(
|
threshold = cv2.adaptiveThreshold(
|
||||||
gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c
|
gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -59,6 +59,8 @@ def read_pdf(
|
||||||
to generate columns.
|
to generate columns.
|
||||||
process_background* : bool, optional (default: False)
|
process_background* : bool, optional (default: False)
|
||||||
Process background lines.
|
Process background lines.
|
||||||
|
process_color_background* : bool, optional (default: False)
|
||||||
|
Increase contrast for better background line processing.
|
||||||
line_scale* : int, optional (default: 15)
|
line_scale* : int, optional (default: 15)
|
||||||
Line size scaling factor. The larger the value the smaller
|
Line size scaling factor. The larger the value the smaller
|
||||||
the detected lines. Making it very large will lead to text
|
the detected lines. Making it very large will lead to text
|
||||||
|
|
|
||||||
|
|
@ -99,6 +99,7 @@ class Lattice(BaseParser):
|
||||||
table_regions=None,
|
table_regions=None,
|
||||||
table_areas=None,
|
table_areas=None,
|
||||||
process_background=False,
|
process_background=False,
|
||||||
|
process_color_background=False,
|
||||||
line_scale=15,
|
line_scale=15,
|
||||||
copy_text=None,
|
copy_text=None,
|
||||||
shift_text=["l", "t"],
|
shift_text=["l", "t"],
|
||||||
|
|
@ -117,6 +118,7 @@ class Lattice(BaseParser):
|
||||||
self.table_regions = table_regions
|
self.table_regions = table_regions
|
||||||
self.table_areas = table_areas
|
self.table_areas = table_areas
|
||||||
self.process_background = process_background
|
self.process_background = process_background
|
||||||
|
self.process_color_background = process_color_background
|
||||||
self.line_scale = line_scale
|
self.line_scale = line_scale
|
||||||
self.copy_text = copy_text
|
self.copy_text = copy_text
|
||||||
self.shift_text = shift_text
|
self.shift_text = shift_text
|
||||||
|
|
@ -255,6 +257,7 @@ class Lattice(BaseParser):
|
||||||
self.image, self.threshold = adaptive_threshold(
|
self.image, self.threshold = adaptive_threshold(
|
||||||
self.imagename,
|
self.imagename,
|
||||||
process_background=self.process_background,
|
process_background=self.process_background,
|
||||||
|
process_color_background=self.process_color_background,
|
||||||
blocksize=self.threshold_blocksize,
|
blocksize=self.threshold_blocksize,
|
||||||
c=self.threshold_constant,
|
c=self.threshold_constant,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -96,6 +96,7 @@ def download_url(url):
|
||||||
stream_kwargs = ["columns", "edge_tol", "row_tol", "column_tol"]
|
stream_kwargs = ["columns", "edge_tol", "row_tol", "column_tol"]
|
||||||
lattice_kwargs = [
|
lattice_kwargs = [
|
||||||
"process_background",
|
"process_background",
|
||||||
|
"process_color_background",
|
||||||
"line_scale",
|
"line_scale",
|
||||||
"copy_text",
|
"copy_text",
|
||||||
"shift_text",
|
"shift_text",
|
||||||
|
|
|
||||||
|
|
@ -33,6 +33,19 @@ To process background lines, you can pass ``process_background=True``.
|
||||||
.. csv-table::
|
.. csv-table::
|
||||||
:file: ../_static/csv/background_lines.csv
|
:file: ../_static/csv/background_lines.csv
|
||||||
|
|
||||||
|
If there is too little contrast between the table background color and the document background color, you can try combining ``process_background=True`` with the experimental option ``process_color_background=True``.
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
>>> tables = camelot.read_pdf('background_lines.pdf', process_background=True, process_color_background=True)
|
||||||
|
>>> tables[1].df
|
||||||
|
|
||||||
|
.. tip::
|
||||||
|
Here's how you can do the same with the :ref:`command-line interface <cli>`.
|
||||||
|
::
|
||||||
|
|
||||||
|
$ camelot lattice -back -color background_lines.pdf
|
||||||
|
|
||||||
Visual debugging
|
Visual debugging
|
||||||
----------------
|
----------------
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue