[ENH] Add OCR and better joint detection
* Add iterations for dilation
* Add OCRLattice and OCRStream
* Add debug
pull/2/head
parent
dd909e2b53
commit
4da754ddcb
|
|
@ -1,3 +1,6 @@
|
||||||
|
from itertools import groupby
|
||||||
|
from operator import itemgetter
|
||||||
|
|
||||||
import cv2
|
import cv2
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
|
|
@ -44,7 +47,7 @@ def adaptive_threshold(imagename, invert=False, blocksize=15, c=-2):
|
||||||
return img, threshold
|
return img, threshold
|
||||||
|
|
||||||
|
|
||||||
def find_lines(threshold, direction='horizontal', scale=15):
|
def find_lines(threshold, direction='horizontal', scale=15, iterations=2):
|
||||||
"""Finds horizontal and vertical lines by applying morphological
|
"""Finds horizontal and vertical lines by applying morphological
|
||||||
transformations on an image.
|
transformations on an image.
|
||||||
|
|
||||||
|
|
@ -62,6 +65,10 @@ def find_lines(threshold, direction='horizontal', scale=15):
|
||||||
for morph transform.
|
for morph transform.
|
||||||
(optional, default: 15)
|
(optional, default: 15)
|
||||||
|
|
||||||
|
iterations : int
|
||||||
|
Number of iterations for dilation.
|
||||||
|
(optional, default: 2)
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
dmask : object
|
dmask : object
|
||||||
|
|
@ -85,10 +92,9 @@ def find_lines(threshold, direction='horizontal', scale=15):
|
||||||
raise ValueError("Specify direction as either 'vertical' or"
|
raise ValueError("Specify direction as either 'vertical' or"
|
||||||
" 'horizontal'")
|
" 'horizontal'")
|
||||||
|
|
||||||
threshold = cv2.erode(threshold, el, (-1, -1))
|
threshold = cv2.erode(threshold, el)
|
||||||
threshold = cv2.dilate(threshold, el, (-1, -1))
|
threshold = cv2.dilate(threshold, el)
|
||||||
|
dmask = cv2.dilate(threshold, el, iterations=iterations)
|
||||||
dmask = threshold # findContours modifies source image
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
_, contours, _ = cv2.findContours(
|
_, contours, _ = cv2.findContours(
|
||||||
|
|
@ -190,4 +196,33 @@ def find_table_joints(contours, vertical, horizontal):
|
||||||
joint_coords.append((c1, c2))
|
joint_coords.append((c1, c2))
|
||||||
tables[(x, y + h, x + w, y)] = joint_coords
|
tables[(x, y + h, x + w, y)] = joint_coords
|
||||||
|
|
||||||
return tables
|
return tables
|
||||||
|
|
||||||
|
|
||||||
|
def find_cuts(threshold, line_threshold=100):
    """Finds cut positions on the y-axis from a horizontal projection.

    Sums the thresholded image along each row, keeps the rows whose sum
    is below `line_threshold`, groups those rows into runs of consecutive
    indices and returns the midpoint of every run.

    Parameters
    ----------
    threshold : object
        numpy.ndarray representing the thresholded image.

    line_threshold : int
        Maximum intensity of projections on y-axis.
        (optional, default: 100)

    Returns
    -------
    y_cuts : list
        List of cuts on y-axis, sorted in descending order.
    """
    y_proj = np.sum(threshold, axis=1)
    y_proj_less = np.where(y_proj < line_threshold)[0]
    ranges = []
    # Consecutive row indices share the same (position - value) difference,
    # so grouping on it yields one group per unbroken run of rows.
    for _, g in groupby(enumerate(y_proj_less),
                        lambda pair: pair[0] - pair[1]):
        # Materialise the run; map() is a one-shot iterator on Python 3.
        group = list(map(itemgetter(1), g))
        ranges.append((group[0], group[-1]))
    # Floor division keeps the midpoints integral on Python 2 and 3 alike.
    y_cuts = [(start + end) // 2 for start, end in ranges]
    return sorted(y_cuts, reverse=True)
|
||||||
|
|
@ -12,7 +12,7 @@ from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
|
||||||
from .table import Table
|
from .table import Table
|
||||||
from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_in_bbox,
|
from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_in_bbox,
|
||||||
merge_close_values, get_table_index, get_score, count_empty,
|
merge_close_values, get_table_index, get_score, count_empty,
|
||||||
encode_list, get_text_objects, get_page_layout)
|
encode_list, get_text_objects, get_page_layout, remove_empty)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['Lattice']
|
__all__ = ['Lattice']
|
||||||
|
|
@ -131,20 +131,20 @@ class Lattice:
|
||||||
direction.
|
direction.
|
||||||
(optional, default: None)
|
(optional, default: None)
|
||||||
|
|
||||||
headers : list
|
|
||||||
List of strings where each string is a csv header for a table.
|
|
||||||
(optional, default: None)
|
|
||||||
|
|
||||||
mtol : list
|
mtol : list
|
||||||
List of ints specifying m-tolerance parameters.
|
List of ints specifying m-tolerance parameters.
|
||||||
(optional, default: [2])
|
(optional, default: [2])
|
||||||
|
|
||||||
blocksize: int
|
jtol : list
|
||||||
|
List of ints specifying j-tolerance parameters.
|
||||||
|
(optional, default: [2])
|
||||||
|
|
||||||
|
blocksize : int
|
||||||
Size of a pixel neighborhood that is used to calculate a
|
Size of a pixel neighborhood that is used to calculate a
|
||||||
threshold value for the pixel: 3, 5, 7, and so on.
|
threshold value for the pixel: 3, 5, 7, and so on.
|
||||||
(optional, default: 15)
|
(optional, default: 15)
|
||||||
|
|
||||||
threshold_constant: float
|
threshold_constant : float
|
||||||
Constant subtracted from the mean or weighted mean
|
Constant subtracted from the mean or weighted mean
|
||||||
(see the details below). Normally, it is positive but may be
|
(see the details below). Normally, it is positive but may be
|
||||||
zero or negative as well.
|
zero or negative as well.
|
||||||
|
|
@ -155,6 +155,10 @@ class Lattice:
|
||||||
element for image processing.
|
element for image processing.
|
||||||
(optional, default: 15)
|
(optional, default: 15)
|
||||||
|
|
||||||
|
iterations : int
|
||||||
|
Number of iterations for dilation.
|
||||||
|
(optional, default: 2)
|
||||||
|
|
||||||
invert : bool
|
invert : bool
|
||||||
Whether or not to invert the image. Useful when pdfs have
|
Whether or not to invert the image. Useful when pdfs have
|
||||||
tables with lines in background.
|
tables with lines in background.
|
||||||
|
|
@ -187,19 +191,20 @@ class Lattice:
|
||||||
of detected contours, lines, joints and the table generated.
|
of detected contours, lines, joints and the table generated.
|
||||||
(optional, default: None)
|
(optional, default: None)
|
||||||
"""
|
"""
|
||||||
def __init__(self, table_area=None, fill=None, headers=None, mtol=[2],
|
def __init__(self, table_area=None, fill=None, mtol=[2], jtol=[2],
|
||||||
blocksize=15, threshold_constant=-2, scale=15, invert=False,
|
blocksize=15, threshold_constant=-2, scale=15, iterations=2,
|
||||||
margins=(1.0, 0.5, 0.1), split_text=False, flag_size=True,
|
invert=False, margins=(1.0, 0.5, 0.1), split_text=False,
|
||||||
shift_text=['l', 't'], debug=None):
|
flag_size=True, shift_text=['l', 't'], debug=None):
|
||||||
|
|
||||||
self.method = 'lattice'
|
self.method = 'lattice'
|
||||||
self.table_area = table_area
|
self.table_area = table_area
|
||||||
self.fill = fill
|
self.fill = fill
|
||||||
self.headers = headers
|
|
||||||
self.mtol = mtol
|
self.mtol = mtol
|
||||||
|
self.jtol = jtol
|
||||||
self.blocksize = blocksize
|
self.blocksize = blocksize
|
||||||
self.threshold_constant = threshold_constant
|
self.threshold_constant = threshold_constant
|
||||||
self.scale = scale
|
self.scale = scale
|
||||||
|
self.iterations = iterations
|
||||||
self.invert = invert
|
self.invert = invert
|
||||||
self.char_margin, self.line_margin, self.word_margin = margins
|
self.char_margin, self.line_margin, self.word_margin = margins
|
||||||
self.split_text = split_text
|
self.split_text = split_text
|
||||||
|
|
@ -257,17 +262,14 @@ class Lattice:
|
||||||
factors_pdf = (sc_x_pdf, sc_y_pdf, img_y)
|
factors_pdf = (sc_x_pdf, sc_y_pdf, img_y)
|
||||||
|
|
||||||
vmask, v_segments = find_lines(threshold, direction='vertical',
|
vmask, v_segments = find_lines(threshold, direction='vertical',
|
||||||
scale=self.scale)
|
scale=self.scale, iterations=self.iterations)
|
||||||
hmask, h_segments = find_lines(threshold, direction='horizontal',
|
hmask, h_segments = find_lines(threshold, direction='horizontal',
|
||||||
scale=self.scale)
|
scale=self.scale, iterations=self.iterations)
|
||||||
|
|
||||||
if self.table_area is not None:
|
if self.table_area is not None:
|
||||||
if self.fill is not None:
|
if self.fill is not None:
|
||||||
if len(self.table_area) != len(self.fill):
|
if len(self.table_area) != len(self.fill):
|
||||||
raise ValueError("Length of fill should be equal to table_area.")
|
raise ValueError("Length of table area and fill should be equal.")
|
||||||
if self.headers is not None:
|
|
||||||
if len(self.table_area) != len(self.headers):
|
|
||||||
raise ValueError("Length of headers should be equal to table_area.")
|
|
||||||
|
|
||||||
areas = []
|
areas = []
|
||||||
for area in self.table_area:
|
for area in self.table_area:
|
||||||
|
|
@ -288,6 +290,11 @@ class Lattice:
|
||||||
else:
|
else:
|
||||||
mtolerance = copy.deepcopy(self.mtol)
|
mtolerance = copy.deepcopy(self.mtol)
|
||||||
|
|
||||||
|
if len(self.jtol) == 1 and self.jtol[0] == 2:
|
||||||
|
jtolerance = copy.deepcopy(self.jtol) * len(table_bbox)
|
||||||
|
else:
|
||||||
|
jtolerance = copy.deepcopy(self.jtol)
|
||||||
|
|
||||||
if self.debug:
|
if self.debug:
|
||||||
self.debug_images = (img, table_bbox)
|
self.debug_images = (img, table_bbox)
|
||||||
|
|
||||||
|
|
@ -326,18 +333,9 @@ class Lattice:
|
||||||
rows = [(rows[i], rows[i + 1])
|
rows = [(rows[i], rows[i + 1])
|
||||||
for i in range(0, len(rows) - 1)]
|
for i in range(0, len(rows) - 1)]
|
||||||
|
|
||||||
if self.headers is not None and self.headers[table_no] != [""]:
|
|
||||||
self.headers[table_no] = self.headers[table_no].split(',')
|
|
||||||
if len(self.headers[table_no]) != len(cols):
|
|
||||||
logger.warning("Length of header ({0}) specified for table is not"
|
|
||||||
" equal to the number of columns ({1}) detected.".format(
|
|
||||||
len(self.headers[table_no]), len(cols)))
|
|
||||||
while len(self.headers[table_no]) != len(cols):
|
|
||||||
self.headers[table_no].append('')
|
|
||||||
|
|
||||||
table = Table(cols, rows)
|
table = Table(cols, rows)
|
||||||
# set table edges to True using ver+hor lines
|
# set table edges to True using ver+hor lines
|
||||||
table = table.set_edges(v_s, h_s)
|
table = table.set_edges(v_s, h_s, jtol=jtolerance[table_no])
|
||||||
nouse = table.nocont_ / (len(v_s) + len(h_s))
|
nouse = table.nocont_ / (len(v_s) + len(h_s))
|
||||||
table_data['line_p'] = 100 * (1 - nouse)
|
table_data['line_p'] = 100 * (1 - nouse)
|
||||||
# set spanning cells to True
|
# set spanning cells to True
|
||||||
|
|
@ -351,27 +349,27 @@ class Lattice:
|
||||||
assignment_errors = []
|
assignment_errors = []
|
||||||
table_data['split_text'] = []
|
table_data['split_text'] = []
|
||||||
table_data['superscript'] = []
|
table_data['superscript'] = []
|
||||||
for direction in t_bbox:
|
for direction in ['vertical', 'horizontal']:
|
||||||
for t in t_bbox[direction]:
|
for t in t_bbox[direction]:
|
||||||
indices, error = get_table_index(
|
indices, error = get_table_index(
|
||||||
table, t, direction, split_text=self.split_text,
|
table, t, direction, split_text=self.split_text,
|
||||||
flag_size=self.flag_size)
|
flag_size=self.flag_size)
|
||||||
assignment_errors.append(error)
|
if indices[:2] != (-1, -1):
|
||||||
indices = _reduce_index(table, indices, shift_text=self.shift_text,)
|
assignment_errors.append(error)
|
||||||
if len(indices) > 1:
|
indices = _reduce_index(table, indices, shift_text=self.shift_text)
|
||||||
table_data['split_text'].append(indices)
|
if len(indices) > 1:
|
||||||
for r_idx, c_idx, text in indices:
|
table_data['split_text'].append(indices)
|
||||||
if all(s in text for s in ['<s>', '</s>']):
|
for r_idx, c_idx, text in indices:
|
||||||
table_data['superscript'].append((r_idx, c_idx, text))
|
if all(s in text for s in ['<s>', '</s>']):
|
||||||
table.cells[r_idx][c_idx].add_text(text)
|
table_data['superscript'].append((r_idx, c_idx, text))
|
||||||
|
table.cells[r_idx][c_idx].add_text(text)
|
||||||
score = get_score([[100, assignment_errors]])
|
score = get_score([[100, assignment_errors]])
|
||||||
table_data['score'] = score
|
table_data['score'] = score
|
||||||
|
|
||||||
if self.fill is not None:
|
if self.fill is not None:
|
||||||
table = _fill_spanning(table, fill=self.fill[table_no])
|
table = _fill_spanning(table, fill=self.fill[table_no])
|
||||||
ar = table.get_list()
|
ar = table.get_list()
|
||||||
if self.headers is not None and self.headers[table_no] != ['']:
|
ar = remove_empty(ar)
|
||||||
ar.insert(0, self.headers[table_no])
|
|
||||||
ar = encode_list(ar)
|
ar = encode_list(ar)
|
||||||
table_data['data'] = ar
|
table_data['data'] = ar
|
||||||
empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
|
empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
|
||||||
|
|
|
||||||
168
camelot/ocr.py
168
camelot/ocr.py
|
|
@ -7,19 +7,18 @@ from PIL import Image
|
||||||
|
|
||||||
from .table import Table
|
from .table import Table
|
||||||
from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
|
from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
|
||||||
find_table_joints)
|
find_table_joints, find_cuts)
|
||||||
from .utils import merge_close_values, encode_list
|
from .utils import merge_close_values, encode_list, remove_empty
|
||||||
|
|
||||||
|
|
||||||
class OCR:
|
class OCRLattice:
|
||||||
"""Uses optical character recognition to get text out of image based pdfs.
|
"""Lattice, but for images.
|
||||||
Currently works only on pdfs with lines.
|
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
table_area : list
|
table_area : list
|
||||||
List of strings of the form x1,y1,x2,y2 where
|
List of strings of the form x1,y1,x2,y2 where
|
||||||
(x1, y1) -> left-top and (x2, y2) -> right-bottom in PDFMiner's
|
(x1, y1) -> left-top and (x2, y2) -> right-bottom in OpenCV's
|
||||||
coordinate space, denoting table areas to analyze.
|
coordinate space, denoting table areas to analyze.
|
||||||
(optional, default: None)
|
(optional, default: None)
|
||||||
|
|
||||||
|
|
@ -27,12 +26,12 @@ class OCR:
|
||||||
List of ints specifying m-tolerance parameters.
|
List of ints specifying m-tolerance parameters.
|
||||||
(optional, default: [2])
|
(optional, default: [2])
|
||||||
|
|
||||||
blocksize: int
|
blocksize : int
|
||||||
Size of a pixel neighborhood that is used to calculate a
|
Size of a pixel neighborhood that is used to calculate a
|
||||||
threshold value for the pixel: 3, 5, 7, and so on.
|
threshold value for the pixel: 3, 5, 7, and so on.
|
||||||
(optional, default: 15)
|
(optional, default: 15)
|
||||||
|
|
||||||
threshold_constant: float
|
threshold_constant : float
|
||||||
Constant subtracted from the mean or weighted mean
|
Constant subtracted from the mean or weighted mean
|
||||||
(see the details below). Normally, it is positive but may be
|
(see the details below). Normally, it is positive but may be
|
||||||
zero or negative as well.
|
zero or negative as well.
|
||||||
|
|
@ -51,6 +50,10 @@ class OCR:
|
||||||
element for image processing.
|
element for image processing.
|
||||||
(optional, default: 15)
|
(optional, default: 15)
|
||||||
|
|
||||||
|
iterations : int
|
||||||
|
Number of iterations for dilation.
|
||||||
|
(optional, default: 2)
|
||||||
|
|
||||||
debug : string
|
debug : string
|
||||||
{'contour', 'line', 'joint', 'table'}
|
{'contour', 'line', 'joint', 'table'}
|
||||||
Set to one of the above values to generate a matplotlib plot
|
Set to one of the above values to generate a matplotlib plot
|
||||||
|
|
@ -58,9 +61,9 @@ class OCR:
|
||||||
(optional, default: None)
|
(optional, default: None)
|
||||||
"""
|
"""
|
||||||
def __init__(self, table_area=None, mtol=[2], blocksize=15, threshold_constant=-2,
|
def __init__(self, table_area=None, mtol=[2], blocksize=15, threshold_constant=-2,
|
||||||
dpi=300, lang="eng", scale=15, debug=None):
|
dpi=300, lang="eng", scale=15, iterations=2, debug=None):
|
||||||
|
|
||||||
self.method = 'ocr'
|
self.method = 'ocrl'
|
||||||
self.table_area = table_area
|
self.table_area = table_area
|
||||||
self.mtol = mtol
|
self.mtol = mtol
|
||||||
self.blocksize = blocksize
|
self.blocksize = blocksize
|
||||||
|
|
@ -69,11 +72,13 @@ class OCR:
|
||||||
self.dpi = dpi
|
self.dpi = dpi
|
||||||
self.lang = lang
|
self.lang = lang
|
||||||
self.scale = scale
|
self.scale = scale
|
||||||
|
self.iterations = iterations
|
||||||
self.debug = debug
|
self.debug = debug
|
||||||
|
|
||||||
def get_tables(self, pdfname):
|
def get_tables(self, pdfname):
|
||||||
if self.tool is None:
|
if self.tool is None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
bname, __ = os.path.splitext(pdfname)
|
bname, __ = os.path.splitext(pdfname)
|
||||||
imagename = ''.join([bname, '.png'])
|
imagename = ''.join([bname, '.png'])
|
||||||
|
|
||||||
|
|
@ -91,9 +96,9 @@ class OCR:
|
||||||
img, threshold = adaptive_threshold(imagename, blocksize=self.blocksize,
|
img, threshold = adaptive_threshold(imagename, blocksize=self.blocksize,
|
||||||
c=self.threshold_constant)
|
c=self.threshold_constant)
|
||||||
vmask, v_segments = find_lines(threshold, direction='vertical',
|
vmask, v_segments = find_lines(threshold, direction='vertical',
|
||||||
scale=self.scale)
|
scale=self.scale, iterations=self.iterations)
|
||||||
hmask, h_segments = find_lines(threshold, direction='horizontal',
|
hmask, h_segments = find_lines(threshold, direction='horizontal',
|
||||||
scale=self.scale)
|
scale=self.scale, iterations=self.iterations)
|
||||||
|
|
||||||
if self.table_area is not None:
|
if self.table_area is not None:
|
||||||
areas = []
|
areas = []
|
||||||
|
|
@ -154,6 +159,7 @@ class OCR:
|
||||||
ar = table.get_list()
|
ar = table.get_list()
|
||||||
ar.reverse()
|
ar.reverse()
|
||||||
ar = encode_list(ar)
|
ar = encode_list(ar)
|
||||||
|
ar = remove_empty(ar)
|
||||||
table_data['data'] = ar
|
table_data['data'] = ar
|
||||||
tables['table-{0}'.format(table_no + 1)] = table_data
|
tables['table-{0}'.format(table_no + 1)] = table_data
|
||||||
table_no += 1
|
table_no += 1
|
||||||
|
|
@ -162,4 +168,142 @@ class OCR:
|
||||||
if self.debug:
|
if self.debug:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
return page
|
||||||
|
|
||||||
|
|
||||||
|
class OCRStream:
    """Stream, but for images.

    Parameters
    ----------
    table_area : list
        List of strings of the form x1,y1,x2,y2 where
        (x1, y1) -> left-top and (x2, y2) -> right-bottom in OpenCV's
        coordinate space, denoting table areas to analyze.
        (optional, default: None)

    columns : list
        List of strings where each string is comma-separated values of
        x-coordinates in OpenCV's coordinate space.
        (optional, default: None)

    blocksize : int
        Size of a pixel neighborhood that is used to calculate a
        threshold value for the pixel: 3, 5, 7, and so on.
        (optional, default: 15)

    threshold_constant : float
        Constant subtracted from the mean or weighted mean
        (see the details below). Normally, it is positive but may be
        zero or negative as well.
        (optional, default: -2)

    line_threshold : int
        Maximum intensity of projections on y-axis.
        (optional, default: 100)

    dpi : int
        Dots per inch.
        (optional, default: 300)

    lang : string
        Language to be used for OCR.
        (optional, default: 'eng')

    debug : bool
        If set, get_tables stores the page image in `debug_images` and
        returns None instead of extracting tables.
        (optional, default: False)
    """
    def __init__(self, table_area=None, columns=None, blocksize=15,
                 threshold_constant=-2, line_threshold=100, dpi=300, lang="eng",
                 debug=False):

        self.method = 'ocrs'
        self.table_area = table_area
        self.columns = columns
        self.blocksize = blocksize
        self.threshold_constant = threshold_constant
        self.line_threshold = line_threshold
        # Use the first available OCR tool; fall back to None (instead of
        # raising IndexError) so that get_tables can bail out cleanly.
        available_tools = pyocr.get_available_tools()
        self.tool = available_tools[0] if available_tools else None
        self.dpi = dpi
        self.lang = lang
        self.debug = debug

    def get_tables(self, pdfname):
        """Renders the pdf page to a png and runs OCR on every table cell.

        Parameters
        ----------
        pdfname : string
            Path to the single page pdf file.

        Returns
        -------
        page : dict
            Dict with the pdf's base name (without extension) as key and
            a dict of tables ('table-1', 'table-2', ...) as value, each
            holding the extracted rows under 'data'. None is returned
            when no OCR tool is available or when debug is set.

        Raises
        ------
        ValueError
            If table_area and columns have different lengths.
        NotImplementedError
            If a table has to be processed and columns is None.
        """
        if self.tool is None:
            return None

        bname, __ = os.path.splitext(pdfname)
        imagename = ''.join([bname, '.png'])

        gs_call = [
            "-q", "-sDEVICE=png16m", "-o", imagename, "-r{0}".format(self.dpi),
            pdfname
        ]
        # check_output returns bytes on Python 3, so compare against a bytes
        # literal (which is a plain str on Python 2).
        gs_version = subprocess.check_output(["gs", "-version"]).lower()
        if b"ghostscript" in gs_version:
            gs_call.insert(0, "gs")
        else:
            gs_call.insert(0, "gsc")
        # Close the devnull handle once Ghostscript has finished.
        with open(os.devnull, 'w') as devnull:
            subprocess.call(gs_call, stdout=devnull, stderr=subprocess.STDOUT)

        img, threshold = adaptive_threshold(imagename, blocksize=self.blocksize,
                                            c=self.threshold_constant)
        height, width = threshold.shape
        if self.debug:
            self.debug_images = img
            return None

        if self.table_area is not None:
            if self.columns is not None:
                if len(self.table_area) != len(self.columns):
                    raise ValueError("Length of table area and columns should be equal.")

            table_bbox = {}
            for area in self.table_area:
                x1, y1, x2, y2 = area.split(",")
                x1 = int(x1)
                y1 = int(y1)
                x2 = int(x2)
                y2 = int(y2)
                table_bbox[(x1, y1, x2, y2)] = None
        else:
            # No areas given: treat the whole page image as one table.
            table_bbox = {(0, 0, width, height): None}

        page = {}
        tables = {}
        table_no = 0
        # Process table areas ordered by their y1 coordinate.
        for k in sorted(table_bbox.keys(), key=lambda x: x[1]):
            if self.columns is None:
                raise NotImplementedError

            table_data = {}
            table_image = threshold[k[1]:k[3], k[0]:k[2]]
            cols = self.columns[table_no].split(',')
            cols = [float(c) for c in cols]
            cols.insert(0, k[0])
            cols.append(k[2])
            # Column separators are shifted by the area's left edge so they
            # index into the cropped table image.
            cols = [(cols[i] - k[0], cols[i + 1] - k[0])
                    for i in range(0, len(cols) - 1)]
            y_cuts = find_cuts(table_image, line_threshold=self.line_threshold)
            rows = [(y_cuts[i], y_cuts[i + 1])
                    for i in range(0, len(y_cuts) - 1)]
            table = Table(cols, rows)
            for cell_row in table.cells:
                for cell in cell_row:
                    x1 = int(cell.x1)
                    y1 = int(cell.y1)
                    x2 = int(cell.x2)
                    y2 = int(cell.y2)
                    # Crop the cell out of the table image and OCR it.
                    cell.image = table_image[y1:y2, x1:x2]
                    cell_image = Image.fromarray(cell.image)
                    text = self.tool.image_to_string(
                        cell_image,
                        lang=self.lang,
                        builder=pyocr.builders.TextBuilder()
                    )
                    cell.add_text(text)
            ar = table.get_list()
            ar.reverse()
            ar = encode_list(ar)
            ar = remove_empty(ar)
            table_data['data'] = ar
            tables['table-{0}'.format(table_no + 1)] = table_data
            table_no += 1
        page[os.path.basename(bname)] = tables

        return page
|
||||||
|
|
@ -141,11 +141,14 @@ class Pdf:
|
||||||
if self.extractor.method == 'stream':
|
if self.extractor.method == 'stream':
|
||||||
self.debug = self.extractor.debug
|
self.debug = self.extractor.debug
|
||||||
self.debug_text = []
|
self.debug_text = []
|
||||||
elif self.extractor.method in ['lattice', 'ocr']:
|
elif self.extractor.method in ['lattice', 'ocrl']:
|
||||||
self.debug = self.extractor.debug
|
self.debug = self.extractor.debug
|
||||||
self.debug_images = []
|
self.debug_images = []
|
||||||
self.debug_segments = []
|
self.debug_segments = []
|
||||||
self.debug_tables = []
|
self.debug_tables = []
|
||||||
|
elif self.extractor.method == 'ocrs':
|
||||||
|
self.debug = self.extractor.debug
|
||||||
|
self.debug_images = []
|
||||||
for p in pages:
|
for p in pages:
|
||||||
table = self.extractor.get_tables(p)
|
table = self.extractor.get_tables(p)
|
||||||
if table is not None:
|
if table is not None:
|
||||||
|
|
@ -157,6 +160,8 @@ class Pdf:
|
||||||
self.debug_images.append(self.extractor.debug_images)
|
self.debug_images.append(self.extractor.debug_images)
|
||||||
self.debug_segments.append(self.extractor.debug_segments)
|
self.debug_segments.append(self.extractor.debug_segments)
|
||||||
self.debug_tables.append(self.extractor.debug_tables)
|
self.debug_tables.append(self.extractor.debug_tables)
|
||||||
|
elif self.extractor.method == 'ocrs':
|
||||||
|
self.debug_images.append(self.extractor.debug_images)
|
||||||
if self.clean:
|
if self.clean:
|
||||||
self.remove_tempdir()
|
self.remove_tempdir()
|
||||||
return tables
|
return tables
|
||||||
|
|
@ -175,7 +180,7 @@ class Pdf:
|
||||||
import matplotlib.patches as patches
|
import matplotlib.patches as patches
|
||||||
|
|
||||||
if self.debug is True:
|
if self.debug is True:
|
||||||
try:
|
if hasattr(self, 'debug_text'):
|
||||||
for text in self.debug_text:
|
for text in self.debug_text:
|
||||||
fig = plt.figure()
|
fig = plt.figure()
|
||||||
ax = fig.add_subplot(111, aspect='equal')
|
ax = fig.add_subplot(111, aspect='equal')
|
||||||
|
|
@ -193,8 +198,10 @@ class Pdf:
|
||||||
ax.set_xlim(min(xs) - 10, max(xs) + 10)
|
ax.set_xlim(min(xs) - 10, max(xs) + 10)
|
||||||
ax.set_ylim(min(ys) - 10, max(ys) + 10)
|
ax.set_ylim(min(ys) - 10, max(ys) + 10)
|
||||||
plt.show()
|
plt.show()
|
||||||
except AttributeError:
|
elif hasattr(self, 'debug_images'):
|
||||||
raise ValueError("This option only be used with Stream.")
|
for img in self.debug_images:
|
||||||
|
plt.imshow(img)
|
||||||
|
plt.show()
|
||||||
elif self.debug == 'contour':
|
elif self.debug == 'contour':
|
||||||
try:
|
try:
|
||||||
for img, table_bbox in self.debug_images:
|
for img, table_bbox in self.debug_images:
|
||||||
|
|
|
||||||
|
|
@ -236,10 +236,6 @@ class Stream:
|
||||||
x-coordinates in PDFMiner's coordinate space.
|
x-coordinates in PDFMiner's coordinate space.
|
||||||
(optional, default: None)
|
(optional, default: None)
|
||||||
|
|
||||||
headers : list
|
|
||||||
List of strings where each string is a csv header for a table.
|
|
||||||
(optional, default: None)
|
|
||||||
|
|
||||||
ytol : list
|
ytol : list
|
||||||
List of ints specifying the y-tolerance parameters.
|
List of ints specifying the y-tolerance parameters.
|
||||||
(optional, default: [2])
|
(optional, default: [2])
|
||||||
|
|
@ -268,14 +264,13 @@ class Stream:
|
||||||
LTTextLineHorizontals in order to select table_area, columns.
|
LTTextLineHorizontals in order to select table_area, columns.
|
||||||
(optional, default: False)
|
(optional, default: False)
|
||||||
"""
|
"""
|
||||||
def __init__(self, table_area=None, columns=None, headers=None,
|
def __init__(self, table_area=None, columns=None, ytol=[2], mtol=[0],
|
||||||
ytol=[2], mtol=[0], margins=(1.0, 0.5, 0.1),
|
margins=(1.0, 0.5, 0.1), split_text=False, flag_size=True,
|
||||||
split_text=False, flag_size=True, debug=False):
|
debug=False):
|
||||||
|
|
||||||
self.method = 'stream'
|
self.method = 'stream'
|
||||||
self.table_area = table_area
|
self.table_area = table_area
|
||||||
self.columns = columns
|
self.columns = columns
|
||||||
self.headers = headers
|
|
||||||
self.ytol = ytol
|
self.ytol = ytol
|
||||||
self.mtol = mtol
|
self.mtol = mtol
|
||||||
self.char_margin, self.line_margin, self.word_margin = margins
|
self.char_margin, self.line_margin, self.word_margin = margins
|
||||||
|
|
@ -312,14 +307,12 @@ class Stream:
|
||||||
self.debug_text = []
|
self.debug_text = []
|
||||||
self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlh])
|
self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlh])
|
||||||
self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlv])
|
self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlv])
|
||||||
|
return None
|
||||||
|
|
||||||
if self.table_area is not None:
|
if self.table_area is not None:
|
||||||
if self.columns is not None:
|
if self.columns is not None:
|
||||||
if len(self.table_area) != len(self.columns):
|
if len(self.table_area) != len(self.columns):
|
||||||
raise ValueError("Length of columns should be equal to table_area.")
|
raise ValueError("Length of table area and columns should be equal.")
|
||||||
if self.headers is not None:
|
|
||||||
if len(self.table_area) != len(self.headers):
|
|
||||||
raise ValueError("Length of headers should be equal to table_area.")
|
|
||||||
|
|
||||||
table_bbox = {}
|
table_bbox = {}
|
||||||
for area in self.table_area:
|
for area in self.table_area:
|
||||||
|
|
@ -336,6 +329,7 @@ class Stream:
|
||||||
ytolerance = copy.deepcopy(self.ytol) * len(table_bbox)
|
ytolerance = copy.deepcopy(self.ytol) * len(table_bbox)
|
||||||
else:
|
else:
|
||||||
ytolerance = copy.deepcopy(self.ytol)
|
ytolerance = copy.deepcopy(self.ytol)
|
||||||
|
|
||||||
if len(self.mtol) == 1 and self.mtol[0] == 0:
|
if len(self.mtol) == 1 and self.mtol[0] == 0:
|
||||||
mtolerance = copy.deepcopy(self.mtol) * len(table_bbox)
|
mtolerance = copy.deepcopy(self.mtol) * len(table_bbox)
|
||||||
else:
|
else:
|
||||||
|
|
@ -374,7 +368,7 @@ class Stream:
|
||||||
guess = True
|
guess = True
|
||||||
ncols = max(set(elements), key=elements.count)
|
ncols = max(set(elements), key=elements.count)
|
||||||
len_non_mode = len(filter(lambda x: x != ncols, elements))
|
len_non_mode = len(filter(lambda x: x != ncols, elements))
|
||||||
if ncols == 1 and not self.debug:
|
if ncols == 1:
|
||||||
# no tables detected
|
# no tables detected
|
||||||
logger.warning("{}: Only one column was detected, the pdf"
|
logger.warning("{}: Only one column was detected, the pdf"
|
||||||
" may have no tables.".format(
|
" may have no tables.".format(
|
||||||
|
|
@ -396,15 +390,6 @@ class Stream:
|
||||||
cols = _add_columns(cols, inner_text, ytolerance[table_no])
|
cols = _add_columns(cols, inner_text, ytolerance[table_no])
|
||||||
cols = _join_columns(cols, text_x_min, text_x_max)
|
cols = _join_columns(cols, text_x_min, text_x_max)
|
||||||
|
|
||||||
if self.headers is not None and self.headers[table_no] != [""]:
|
|
||||||
self.headers[table_no] = self.headers[table_no].split(',')
|
|
||||||
if len(self.headers[table_no]) != len(cols):
|
|
||||||
logger.warning("Length of header ({0}) specified for table is not"
|
|
||||||
" equal to the number of columns ({1}) detected.".format(
|
|
||||||
len(self.headers[table_no]), len(cols)))
|
|
||||||
while len(self.headers[table_no]) != len(cols):
|
|
||||||
self.headers[table_no].append('')
|
|
||||||
|
|
||||||
table = Table(cols, rows)
|
table = Table(cols, rows)
|
||||||
table = table.set_all_edges()
|
table = table.set_all_edges()
|
||||||
assignment_errors = []
|
assignment_errors = []
|
||||||
|
|
@ -429,8 +414,6 @@ class Stream:
|
||||||
|
|
||||||
table_data['score'] = score
|
table_data['score'] = score
|
||||||
ar = table.get_list()
|
ar = table.get_list()
|
||||||
if self.headers is not None and self.headers[table_no] != ['']:
|
|
||||||
ar.insert(0, self.headers[table_no])
|
|
||||||
ar = encode_list(ar)
|
ar = encode_list(ar)
|
||||||
table_data['data'] = ar
|
table_data['data'] = ar
|
||||||
empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
|
empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
|
||||||
|
|
|
||||||
|
|
@ -188,38 +188,32 @@ class Table:
|
||||||
bound = self.cells[r][c].get_bounded_edges()
|
bound = self.cells[r][c].get_bounded_edges()
|
||||||
if bound == 4:
|
if bound == 4:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
elif bound == 3:
|
elif bound == 3:
|
||||||
if not self.cells[r][c].left:
|
if not self.cells[r][c].left:
|
||||||
if (self.cells[r][c].right and
|
if (self.cells[r][c].right and
|
||||||
self.cells[r][c].top and
|
self.cells[r][c].top and
|
||||||
self.cells[r][c].bottom):
|
self.cells[r][c].bottom):
|
||||||
self.cells[r][c].spanning_h = True
|
self.cells[r][c].spanning_h = True
|
||||||
|
|
||||||
elif not self.cells[r][c].right:
|
elif not self.cells[r][c].right:
|
||||||
if (self.cells[r][c].left and
|
if (self.cells[r][c].left and
|
||||||
self.cells[r][c].top and
|
self.cells[r][c].top and
|
||||||
self.cells[r][c].bottom):
|
self.cells[r][c].bottom):
|
||||||
self.cells[r][c].spanning_h = True
|
self.cells[r][c].spanning_h = True
|
||||||
|
|
||||||
elif not self.cells[r][c].top:
|
elif not self.cells[r][c].top:
|
||||||
if (self.cells[r][c].left and
|
if (self.cells[r][c].left and
|
||||||
self.cells[r][c].right and
|
self.cells[r][c].right and
|
||||||
self.cells[r][c].bottom):
|
self.cells[r][c].bottom):
|
||||||
self.cells[r][c].spanning_v = True
|
self.cells[r][c].spanning_v = True
|
||||||
|
|
||||||
elif not self.cells[r][c].bottom:
|
elif not self.cells[r][c].bottom:
|
||||||
if (self.cells[r][c].left and
|
if (self.cells[r][c].left and
|
||||||
self.cells[r][c].right and
|
self.cells[r][c].right and
|
||||||
self.cells[r][c].top):
|
self.cells[r][c].top):
|
||||||
self.cells[r][c].spanning_v = True
|
self.cells[r][c].spanning_v = True
|
||||||
|
|
||||||
elif bound == 2:
|
elif bound == 2:
|
||||||
if self.cells[r][c].left and self.cells[r][c].right:
|
if self.cells[r][c].left and self.cells[r][c].right:
|
||||||
if (not self.cells[r][c].top and
|
if (not self.cells[r][c].top and
|
||||||
not self.cells[r][c].bottom):
|
not self.cells[r][c].bottom):
|
||||||
self.cells[r][c].spanning_v = True
|
self.cells[r][c].spanning_v = True
|
||||||
|
|
||||||
elif self.cells[r][c].top and self.cells[r][c].bottom:
|
elif self.cells[r][c].top and self.cells[r][c].bottom:
|
||||||
if (not self.cells[r][c].left and
|
if (not self.cells[r][c].left and
|
||||||
not self.cells[r][c].right):
|
not self.cells[r][c].right):
|
||||||
|
|
|
||||||
|
|
@ -426,40 +426,43 @@ def split_textline(table, textline, direction, flag_size=True):
|
||||||
idx = 0
|
idx = 0
|
||||||
cut_text = []
|
cut_text = []
|
||||||
bbox = textline.bbox
|
bbox = textline.bbox
|
||||||
if direction == 'horizontal' and not textline.is_empty():
|
try:
|
||||||
x_overlap = [i for i, x in enumerate(table.cols) if x[0] <= bbox[2] and bbox[0] <= x[1]]
|
if direction == 'horizontal' and not textline.is_empty():
|
||||||
r_idx = [j for j, r in enumerate(table.rows) if r[1] <= (bbox[1] + bbox[3]) / 2 <= r[0]]
|
x_overlap = [i for i, x in enumerate(table.cols) if x[0] <= bbox[2] and bbox[0] <= x[1]]
|
||||||
r = r_idx[0]
|
r_idx = [j for j, r in enumerate(table.rows) if r[1] <= (bbox[1] + bbox[3]) / 2 <= r[0]]
|
||||||
x_cuts = [(c, table.cells[r][c].x2) for c in x_overlap if table.cells[r][c].right]
|
r = r_idx[0]
|
||||||
if not x_cuts:
|
x_cuts = [(c, table.cells[r][c].x2) for c in x_overlap if table.cells[r][c].right]
|
||||||
x_cuts = [(x_overlap[0], table.cells[r][-1].x2)]
|
if not x_cuts:
|
||||||
for obj in textline._objs:
|
x_cuts = [(x_overlap[0], table.cells[r][-1].x2)]
|
||||||
row = table.rows[r]
|
for obj in textline._objs:
|
||||||
for cut in x_cuts:
|
row = table.rows[r]
|
||||||
if isinstance(obj, LTChar):
|
for cut in x_cuts:
|
||||||
if (row[1] <= (obj.y0 + obj.y1) / 2 <= row[0] and
|
if isinstance(obj, LTChar):
|
||||||
(obj.x0 + obj.x1) / 2 <= cut[1]):
|
if (row[1] <= (obj.y0 + obj.y1) / 2 <= row[0] and
|
||||||
|
(obj.x0 + obj.x1) / 2 <= cut[1]):
|
||||||
|
cut_text.append((r, cut[0], obj))
|
||||||
|
break
|
||||||
|
elif isinstance(obj, LTAnno):
|
||||||
cut_text.append((r, cut[0], obj))
|
cut_text.append((r, cut[0], obj))
|
||||||
break
|
elif direction == 'vertical' and not textline.is_empty():
|
||||||
elif isinstance(obj, LTAnno):
|
y_overlap = [j for j, y in enumerate(table.rows) if y[1] <= bbox[3] and bbox[1] <= y[0]]
|
||||||
cut_text.append((r, cut[0], obj))
|
c_idx = [i for i, c in enumerate(table.cols) if c[0] <= (bbox[0] + bbox[2]) / 2 <= c[1]]
|
||||||
elif direction == 'vertical' and not textline.is_empty():
|
c = c_idx[0]
|
||||||
y_overlap = [j for j, y in enumerate(table.rows) if y[1] <= bbox[3] and bbox[1] <= y[0]]
|
y_cuts = [(r, table.cells[r][c].y1) for r in y_overlap if table.cells[r][c].bottom]
|
||||||
c_idx = [i for i, c in enumerate(table.cols) if c[0] <= (bbox[0] + bbox[2]) / 2 <= c[1]]
|
if not y_cuts:
|
||||||
c = c_idx[0]
|
y_cuts = [(y_overlap[0], table.cells[-1][c].y1)]
|
||||||
y_cuts = [(r, table.cells[r][c].y1) for r in y_overlap if table.cells[r][c].bottom]
|
for obj in textline._objs:
|
||||||
if not y_cuts:
|
col = table.cols[c]
|
||||||
y_cuts = [(y_overlap[0], table.cells[-1][c].y1)]
|
for cut in y_cuts:
|
||||||
for obj in textline._objs:
|
if isinstance(obj, LTChar):
|
||||||
col = table.cols[c]
|
if (col[0] <= (obj.x0 + obj.x1) / 2 <= col[1] and
|
||||||
for cut in y_cuts:
|
(obj.y0 + obj.y1) / 2 >= cut[1]):
|
||||||
if isinstance(obj, LTChar):
|
cut_text.append((cut[0], c, obj))
|
||||||
if (col[0] <= (obj.x0 + obj.x1) / 2 <= col[1] and
|
break
|
||||||
(obj.y0 + obj.y1) / 2 >= cut[1]):
|
elif isinstance(obj, LTAnno):
|
||||||
cut_text.append((cut[0], c, obj))
|
cut_text.append((cut[0], c, obj))
|
||||||
break
|
except IndexError:
|
||||||
elif isinstance(obj, LTAnno):
|
return [(-1, -1, textline.get_text())]
|
||||||
cut_text.append((cut[0], c, obj))
|
|
||||||
grouped_chars = []
|
grouped_chars = []
|
||||||
for key, chars in groupby(cut_text, itemgetter(0, 1)):
|
for key, chars in groupby(cut_text, itemgetter(0, 1)):
|
||||||
if flag_size:
|
if flag_size:
|
||||||
|
|
|
||||||
157
tools/camelot
157
tools/camelot
|
|
@ -18,7 +18,7 @@ from PyPDF2 import PdfFileReader
|
||||||
from camelot.pdf import Pdf
|
from camelot.pdf import Pdf
|
||||||
from camelot.lattice import Lattice
|
from camelot.lattice import Lattice
|
||||||
from camelot.stream import Stream
|
from camelot.stream import Stream
|
||||||
from camelot.ocr import OCR
|
from camelot.ocr import OCRLattice, OCRStream
|
||||||
from camelot import utils
|
from camelot import utils
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -54,7 +54,8 @@ options:
|
||||||
camelot methods:
|
camelot methods:
|
||||||
lattice Looks for lines between data.
|
lattice Looks for lines between data.
|
||||||
stream Looks for spaces between data.
|
stream Looks for spaces between data.
|
||||||
ocr Looks for lines in image based pdfs.
|
ocrl Lattice, but for images.
|
||||||
|
ocrs Stream, but for images.
|
||||||
|
|
||||||
See 'camelot <method> -h' for more information on a specific method.
|
See 'camelot <method> -h' for more information on a specific method.
|
||||||
"""
|
"""
|
||||||
|
|
@ -63,20 +64,22 @@ lattice_doc = """
|
||||||
Lattice method looks for lines between text to form a table.
|
Lattice method looks for lines between text to form a table.
|
||||||
|
|
||||||
usage:
|
usage:
|
||||||
camelot lattice [-t <tarea>...] [-F <fill>...] [-H <header>...]
|
camelot lattice [-t <tarea>...] [-F <fill>...] [-m <mtol>...]
|
||||||
[-m <mtol>...] [options] [--] <file>
|
[-j <jtol>...] [options] [--] <file>
|
||||||
|
|
||||||
options:
|
options:
|
||||||
-t, --tarea <tarea> Specific table areas to analyze.
|
-t, --tarea <tarea> Specific table areas to analyze.
|
||||||
-F, --fill <fill> Fill data in horizontal and/or vertical spanning
|
-F, --fill <fill> Fill data in horizontal and/or vertical spanning
|
||||||
cells. Example: -F h, -F v, -F hv
|
cells. Example: -F h, -F v, -F hv
|
||||||
-H, --header <header> Specify header for each table.
|
|
||||||
-m, --mtol <mtol> Tolerance to account for when merging lines
|
-m, --mtol <mtol> Tolerance to account for when merging lines
|
||||||
which are very close. [default: 2]
|
which are very close. [default: 2]
|
||||||
|
-j, --jtol <jtol> Tolerance to account for when matching line endings
|
||||||
|
with intersections. [default: 2]
|
||||||
-b, --blocksize <blocksize> See adaptive threshold doc. [default: 15]
|
-b, --blocksize <blocksize> See adaptive threshold doc. [default: 15]
|
||||||
-c, --constant <constant> See adaptive threshold doc. [default: -2]
|
-C, --constant <constant> See adaptive threshold doc. [default: -2]
|
||||||
-s, --scale <scale> Scaling factor. Large scaling factor leads to
|
-s, --scale <scale> Scaling factor. Large scaling factor leads to
|
||||||
smaller lines being detected. [default: 15]
|
smaller lines being detected. [default: 15]
|
||||||
|
-I, --iterations <iterations> Number of iterations for dilation. [default: 2]
|
||||||
-i, --invert Invert pdf image to make sure that lines are
|
-i, --invert Invert pdf image to make sure that lines are
|
||||||
in foreground.
|
in foreground.
|
||||||
-T, --shift_text <shift_text> Specify where the text in a spanning cell
|
-T, --shift_text <shift_text> Specify where the text in a spanning cell
|
||||||
|
|
@ -89,41 +92,61 @@ stream_doc = """
|
||||||
Stream method looks for whitespaces between text to form a table.
|
Stream method looks for whitespaces between text to form a table.
|
||||||
|
|
||||||
usage:
|
usage:
|
||||||
camelot stream [-t <tarea>...] [-c <columns>...] [-H <header>...]
|
camelot stream [-t <tarea>...] [-c <columns>...] [-m <mtol>...]
|
||||||
[-y <ytol>...] [-m <mtol>...] [options] [--] <file>
|
[-y <ytol>...] [options] [--] <file>
|
||||||
|
|
||||||
options:
|
options:
|
||||||
-t, --tarea <tarea> Specific table areas to analyze.
|
-t, --tarea <tarea> Specific table areas to analyze.
|
||||||
-c, --columns <columns> Comma-separated list of column x-coordinates.
|
-c, --columns <columns> Comma-separated list of column x-coordinates.
|
||||||
Example: -c 10.1,20.2,30.3
|
Example: -c 10.1,20.2,30.3
|
||||||
-H, --header <header> Specify header for each table.
|
|
||||||
-y, --ytol <ytol> Tolerance to account for when grouping rows
|
|
||||||
together. [default: 2]
|
|
||||||
-m, --mtol <mtol> Tolerance to account for when merging columns
|
-m, --mtol <mtol> Tolerance to account for when merging columns
|
||||||
together. [default: 0]
|
together. [default: 0]
|
||||||
|
-y, --ytol <ytol> Tolerance to account for when grouping rows
|
||||||
|
together. [default: 2]
|
||||||
-d, --debug Debug by visualizing textboxes.
|
-d, --debug Debug by visualizing textboxes.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
ocr_doc = """
|
ocrl_doc = """
|
||||||
OCR method looks for lines in image based pdfs.
|
Lattice, but for images.
|
||||||
|
|
||||||
usage:
|
usage:
|
||||||
camelot ocr [-t <tarea>] [-m <mtol>] [options] [--] <file>
|
camelot ocrl [-t <tarea>...] [-m <mtol>...] [options] [--] <file>
|
||||||
|
|
||||||
options:
|
options:
|
||||||
-t, --tarea <tarea> Specific table areas to analyze.
|
-t, --tarea <tarea> Specific table areas to analyze.
|
||||||
-m, --mtol <mtol> Tolerance to account for when merging lines
|
-m, --mtol <mtol> Tolerance to account for when merging lines
|
||||||
which are very close. [default: 2]
|
which are very close. [default: 2]
|
||||||
-b, --blocksize <blocksize> See adaptive threshold doc. [default: 15]
|
-b, --blocksize <blocksize> See adaptive threshold doc. [default: 15]
|
||||||
-c, --constant <constant> See adaptive threshold doc. [default: -2]
|
-C, --constant <constant> See adaptive threshold doc. [default: -2]
|
||||||
-D, --dpi <dpi> Dots per inch, specify image quality to be used for OCR.
|
-D, --dpi <dpi> Dots per inch, specify image quality to be used for OCR.
|
||||||
[default: 300]
|
[default: 300]
|
||||||
-l, --lang <lang> Specify language to be used for OCR. [default: eng]
|
-l, --lang <lang> Specify language to be used for OCR. [default: eng]
|
||||||
-s, --scale <scale> Scaling factor. Large scaling factor leads to
|
-s, --scale <scale> Scaling factor. Large scaling factor leads to
|
||||||
smaller lines being detected. [default: 15]
|
smaller lines being detected. [default: 15]
|
||||||
-d, --debug <debug> Debug by visualizing pdf geometry.
|
-I, --iterations <iterations> Number of iterations for dilation. [default: 2]
|
||||||
(contour,line,joint,table) Example: -d table
|
-d, --debug <debug> Debug by visualizing pdf geometry.
|
||||||
|
(contour,line,joint,table) Example: -d table
|
||||||
|
"""
|
||||||
|
|
||||||
|
ocrs_doc = """
|
||||||
|
Stream, but for images.
|
||||||
|
|
||||||
|
usage:
|
||||||
|
camelot ocrs [-t <tarea>...] [-c <columns>...] [options] [--] <file>
|
||||||
|
|
||||||
|
options:
|
||||||
|
-t, --tarea <tarea> Specific table areas to analyze.
|
||||||
|
-c, --columns <columns> Comma-separated list of column x-coordinates.
|
||||||
|
Example: -c 10.1,20.2,30.3
|
||||||
|
-b, --blocksize <blocksize> See adaptive threshold doc. [default: 15]
|
||||||
|
-C, --constant <constant> See adaptive threshold doc. [default: -2]
|
||||||
|
-N, --line-threshold <line_threshold> Maximum intensity of projections on y-axis.
|
||||||
|
[default: 100]
|
||||||
|
-D, --dpi <dpi> Dots per inch, specify image quality to be used for OCR.
|
||||||
|
[default: 300]
|
||||||
|
-l, --lang <lang> Specify language to be used for OCR. [default: eng]
|
||||||
|
-d, --debug Debug by visualizing image.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -351,8 +374,10 @@ if __name__ == '__main__':
|
||||||
args.update(docopt(lattice_doc, argv=argv))
|
args.update(docopt(lattice_doc, argv=argv))
|
||||||
elif args['<method>'] == 'stream':
|
elif args['<method>'] == 'stream':
|
||||||
args.update(docopt(stream_doc, argv=argv))
|
args.update(docopt(stream_doc, argv=argv))
|
||||||
elif args['<method>'] == 'ocr':
|
elif args['<method>'] == 'ocrl':
|
||||||
args.update(docopt(ocr_doc, argv=argv))
|
args.update(docopt(ocrl_doc, argv=argv))
|
||||||
|
elif args['<method>'] == 'ocrs':
|
||||||
|
args.update(docopt(ocrs_doc, argv=argv))
|
||||||
|
|
||||||
filename = args['<file>']
|
filename = args['<file>']
|
||||||
filedir = os.path.dirname(args['<file>'])
|
filedir = os.path.dirname(args['<file>'])
|
||||||
|
|
@ -392,11 +417,12 @@ if __name__ == '__main__':
|
||||||
kwargs = {
|
kwargs = {
|
||||||
'table_area': args['--tarea'] if args['--tarea'] else None,
|
'table_area': args['--tarea'] if args['--tarea'] else None,
|
||||||
'fill': args['--fill'] if args['--fill'] else None,
|
'fill': args['--fill'] if args['--fill'] else None,
|
||||||
'headers': args['--header'] if args['--header'] else None,
|
|
||||||
'mtol': [int(m) for m in args['--mtol']],
|
'mtol': [int(m) for m in args['--mtol']],
|
||||||
|
'jtol': [int(j) for j in args['--jtol']],
|
||||||
'blocksize': int(args['--blocksize']),
|
'blocksize': int(args['--blocksize']),
|
||||||
'threshold_constant': float(args['--constant']),
|
'threshold_constant': float(args['--constant']),
|
||||||
'scale': int(args['--scale']),
|
'scale': int(args['--scale']),
|
||||||
|
'iterations': int(args['--iterations']),
|
||||||
'invert': args['--invert'],
|
'invert': args['--invert'],
|
||||||
'margins': margins,
|
'margins': margins,
|
||||||
'split_text': args['--split_text'],
|
'split_text': args['--split_text'],
|
||||||
|
|
@ -462,7 +488,6 @@ if __name__ == '__main__':
|
||||||
kwargs = {
|
kwargs = {
|
||||||
'table_area': args['--tarea'] if args['--tarea'] else None,
|
'table_area': args['--tarea'] if args['--tarea'] else None,
|
||||||
'columns': args['--columns'] if args['--columns'] else None,
|
'columns': args['--columns'] if args['--columns'] else None,
|
||||||
'headers': args['--header'] if args['--header'] else None,
|
|
||||||
'ytol': [int(y) for y in args['--ytol']],
|
'ytol': [int(y) for y in args['--ytol']],
|
||||||
'mtol': [int(m) for m in args['--mtol']],
|
'mtol': [int(m) for m in args['--mtol']],
|
||||||
'margins': margins,
|
'margins': margins,
|
||||||
|
|
@ -522,7 +547,7 @@ if __name__ == '__main__':
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.exception(e.message, exc_info=True)
|
logger.exception(e.message, exc_info=True)
|
||||||
sys.exit()
|
sys.exit()
|
||||||
elif args['<method>'] == 'ocr':
|
elif args['<method>'] == 'ocrl':
|
||||||
try:
|
try:
|
||||||
kwargs = {
|
kwargs = {
|
||||||
'table_area': args['--tarea'] if args['--tarea'] else None,
|
'table_area': args['--tarea'] if args['--tarea'] else None,
|
||||||
|
|
@ -532,9 +557,75 @@ if __name__ == '__main__':
|
||||||
'dpi': int(args['--dpi']),
|
'dpi': int(args['--dpi']),
|
||||||
'lang': args['--lang'],
|
'lang': args['--lang'],
|
||||||
'scale': int(args['--scale']),
|
'scale': int(args['--scale']),
|
||||||
|
'iterations': int(args['--iterations']),
|
||||||
'debug': args['--debug']
|
'debug': args['--debug']
|
||||||
}
|
}
|
||||||
manager = Pdf(OCR(**kwargs), filename, pagenos=p, clean=True,
|
manager = Pdf(OCRLattice(**kwargs), filename, pagenos=p, clean=True,
|
||||||
|
parallel=args['--parallel'])
|
||||||
|
data = manager.extract()
|
||||||
|
|
||||||
|
processing_time = time.time() - start_time
|
||||||
|
logger.info("Finished processing in " + str(processing_time) + " seconds")
|
||||||
|
|
||||||
|
if args['--plot']:
|
||||||
|
if args['--output']:
|
||||||
|
pngname = os.path.join(args['--output'], os.path.basename(pngname))
|
||||||
|
plot_type = args['--plot'].split(',')
|
||||||
|
if 'page' in plot_type:
|
||||||
|
for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
|
||||||
|
page = data[page_number]
|
||||||
|
for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
|
||||||
|
table = page[table_number]
|
||||||
|
plot_table_barchart(table['r_nempty_cells'],
|
||||||
|
table['c_nempty_cells'],
|
||||||
|
table['empty_p'],
|
||||||
|
page_number,
|
||||||
|
table_number)
|
||||||
|
|
||||||
|
if 'all' in plot_type:
|
||||||
|
plot_all_barchart(data, pngname)
|
||||||
|
|
||||||
|
if 'rc' in plot_type:
|
||||||
|
plot_rc_piechart(data, pngname)
|
||||||
|
|
||||||
|
if args['--print-stats']:
|
||||||
|
print_stats(data, processing_time)
|
||||||
|
|
||||||
|
if args['--save-stats']:
|
||||||
|
if args['--output']:
|
||||||
|
scorename = os.path.join(args['--output'], os.path.basename(scorename))
|
||||||
|
with open(scorename, 'w') as score_file:
|
||||||
|
score_file.write('table,nrows,ncols,empty_p,line_p,text_p,score\n')
|
||||||
|
for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
|
||||||
|
page = data[page_number]
|
||||||
|
for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
|
||||||
|
table = page[table_number]
|
||||||
|
score_file.write('{0},{1},{2},{3},{4},{5},{6}\n'.format(
|
||||||
|
''.join([page_number, '_', table_number]),
|
||||||
|
table['nrows'],
|
||||||
|
table['ncols'],
|
||||||
|
table['empty_p'],
|
||||||
|
table['line_p'],
|
||||||
|
table['text_p'],
|
||||||
|
table['score']))
|
||||||
|
if args['--debug']:
|
||||||
|
manager.debug_plot()
|
||||||
|
except Exception as e:
|
||||||
|
logger.exception(e.message, exc_info=True)
|
||||||
|
sys.exit()
|
||||||
|
elif args['<method>'] == 'ocrs':
|
||||||
|
try:
|
||||||
|
kwargs = {
|
||||||
|
'table_area': args['--tarea'] if args['--tarea'] else None,
|
||||||
|
'columns': args['--columns'] if args['--columns'] else None,
|
||||||
|
'blocksize': int(args['--blocksize']),
|
||||||
|
'threshold_constant': float(args['--constant']),
|
||||||
|
'line_threshold': int(args['--line-threshold']),
|
||||||
|
'dpi': int(args['--dpi']),
|
||||||
|
'lang': args['--lang'],
|
||||||
|
'debug': args['--debug']
|
||||||
|
}
|
||||||
|
manager = Pdf(OCRStream(**kwargs), filename, pagenos=p, clean=True,
|
||||||
parallel=args['--parallel'])
|
parallel=args['--parallel'])
|
||||||
data = manager.extract()
|
data = manager.extract()
|
||||||
|
|
||||||
|
|
@ -588,7 +679,7 @@ if __name__ == '__main__':
|
||||||
logger.exception(e.message, exc_info=True)
|
logger.exception(e.message, exc_info=True)
|
||||||
sys.exit()
|
sys.exit()
|
||||||
|
|
||||||
if args['--debug']:
|
if args.get('--debug') is not None and args['--debug']:
|
||||||
print("See 'camelot <method> -h' for various parameters you can tweak.")
|
print("See 'camelot <method> -h' for various parameters you can tweak.")
|
||||||
else:
|
else:
|
||||||
output = filedir if args['--output'] is None else args['--output']
|
output = filedir if args['--output'] is None else args['--output']
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue