[ENH] Add OCR and better joint detection

* Add iterations for dilation

* Add OCRLattice and OCRStream

* Add debug
pull/2/head
Vinayak Mehta 2017-04-18 18:25:47 +05:30 committed by GitHub
parent dd909e2b53
commit 4da754ddcb
8 changed files with 411 additions and 156 deletions

View File

@ -1,3 +1,6 @@
from itertools import groupby
from operator import itemgetter
import cv2
import numpy as np
@ -44,7 +47,7 @@ def adaptive_threshold(imagename, invert=False, blocksize=15, c=-2):
return img, threshold
def find_lines(threshold, direction='horizontal', scale=15):
def find_lines(threshold, direction='horizontal', scale=15, iterations=2):
"""Finds horizontal and vertical lines by applying morphological
transformations on an image.
@ -62,6 +65,10 @@ def find_lines(threshold, direction='horizontal', scale=15):
for morph transform.
(optional, default: 15)
iterations : int
Number of iterations for dilation.
(optional, default: 2)
Returns
-------
dmask : object
@ -85,10 +92,9 @@ def find_lines(threshold, direction='horizontal', scale=15):
raise ValueError("Specify direction as either 'vertical' or"
" 'horizontal'")
threshold = cv2.erode(threshold, el, (-1, -1))
threshold = cv2.dilate(threshold, el, (-1, -1))
dmask = threshold # findContours modifies source image
threshold = cv2.erode(threshold, el)
threshold = cv2.dilate(threshold, el)
dmask = cv2.dilate(threshold, el, iterations=iterations)
try:
_, contours, _ = cv2.findContours(
@ -190,4 +196,33 @@ def find_table_joints(contours, vertical, horizontal):
joint_coords.append((c1, c2))
tables[(x, y + h, x + w, y)] = joint_coords
return tables
return tables
def find_cuts(threshold, line_threshold=100):
    """Finds horizontal cut positions in the gaps between rows of content.

    Sums the image along each row, collects the rows whose sum is below
    line_threshold, groups consecutive such rows into runs and returns
    the middle row index of every run.

    Parameters
    ----------
    threshold : object
        numpy.ndarray representing the thresholded image.

    line_threshold : int
        Maximum intensity of projections on y-axis.
        (optional, default: 100)

    Returns
    -------
    y_cuts : list
        List of cuts on y-axis, sorted in descending order.
    """
    y_proj = np.sum(threshold, axis=1)
    y_proj_less = np.where(y_proj < line_threshold)[0]

    # Within a run of consecutive row indices, (position - row index) is
    # constant, so grouping on that difference splits the rows into runs.
    ranges = []
    for _, run in groupby(enumerate(y_proj_less),
                          lambda pair: pair[0] - pair[1]):
        group = [row for _, row in run]
        ranges.append((group[0], group[-1]))

    # Floor division keeps the cuts usable as integer row indices on both
    # Python 2 and Python 3.
    y_cuts = [(first + last) // 2 for first, last in ranges]
    return sorted(y_cuts, reverse=True)

View File

@ -12,7 +12,7 @@ from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
from .table import Table
from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_in_bbox,
merge_close_values, get_table_index, get_score, count_empty,
encode_list, get_text_objects, get_page_layout)
encode_list, get_text_objects, get_page_layout, remove_empty)
__all__ = ['Lattice']
@ -131,20 +131,20 @@ class Lattice:
direction.
(optional, default: None)
headers : list
List of strings where each string is a csv header for a table.
(optional, default: None)
mtol : list
List of ints specifying m-tolerance parameters.
(optional, default: [2])
blocksize: int
jtol : list
List of ints specifying j-tolerance parameters.
(optional, default: [2])
blocksize : int
Size of a pixel neighborhood that is used to calculate a
threshold value for the pixel: 3, 5, 7, and so on.
(optional, default: 15)
threshold_constant: float
threshold_constant : float
Constant subtracted from the mean or weighted mean
(see the details below). Normally, it is positive but may be
zero or negative as well.
@ -155,6 +155,10 @@ class Lattice:
element for image processing.
(optional, default: 15)
iterations : int
Number of iterations for dilation.
(optional, default: 2)
invert : bool
Whether or not to invert the image. Useful when pdfs have
tables with lines in background.
@ -187,19 +191,20 @@ class Lattice:
of detected contours, lines, joints and the table generated.
(optional, default: None)
"""
def __init__(self, table_area=None, fill=None, headers=None, mtol=[2],
blocksize=15, threshold_constant=-2, scale=15, invert=False,
margins=(1.0, 0.5, 0.1), split_text=False, flag_size=True,
shift_text=['l', 't'], debug=None):
def __init__(self, table_area=None, fill=None, mtol=[2], jtol=[2],
blocksize=15, threshold_constant=-2, scale=15, iterations=2,
invert=False, margins=(1.0, 0.5, 0.1), split_text=False,
flag_size=True, shift_text=['l', 't'], debug=None):
self.method = 'lattice'
self.table_area = table_area
self.fill = fill
self.headers = headers
self.mtol = mtol
self.jtol = jtol
self.blocksize = blocksize
self.threshold_constant = threshold_constant
self.scale = scale
self.iterations = iterations
self.invert = invert
self.char_margin, self.line_margin, self.word_margin = margins
self.split_text = split_text
@ -257,17 +262,14 @@ class Lattice:
factors_pdf = (sc_x_pdf, sc_y_pdf, img_y)
vmask, v_segments = find_lines(threshold, direction='vertical',
scale=self.scale)
scale=self.scale, iterations=self.iterations)
hmask, h_segments = find_lines(threshold, direction='horizontal',
scale=self.scale)
scale=self.scale, iterations=self.iterations)
if self.table_area is not None:
if self.fill is not None:
if len(self.table_area) != len(self.fill):
raise ValueError("Length of fill should be equal to table_area.")
if self.headers is not None:
if len(self.table_area) != len(self.headers):
raise ValueError("Length of headers should be equal to table_area.")
raise ValueError("Length of table area and fill should be equal.")
areas = []
for area in self.table_area:
@ -288,6 +290,11 @@ class Lattice:
else:
mtolerance = copy.deepcopy(self.mtol)
if len(self.jtol) == 1 and self.jtol[0] == 2:
jtolerance = copy.deepcopy(self.jtol) * len(table_bbox)
else:
jtolerance = copy.deepcopy(self.jtol)
if self.debug:
self.debug_images = (img, table_bbox)
@ -326,18 +333,9 @@ class Lattice:
rows = [(rows[i], rows[i + 1])
for i in range(0, len(rows) - 1)]
if self.headers is not None and self.headers[table_no] != [""]:
self.headers[table_no] = self.headers[table_no].split(',')
if len(self.headers[table_no]) != len(cols):
logger.warning("Length of header ({0}) specified for table is not"
" equal to the number of columns ({1}) detected.".format(
len(self.headers[table_no]), len(cols)))
while len(self.headers[table_no]) != len(cols):
self.headers[table_no].append('')
table = Table(cols, rows)
# set table edges to True using ver+hor lines
table = table.set_edges(v_s, h_s)
table = table.set_edges(v_s, h_s, jtol=jtolerance[table_no])
nouse = table.nocont_ / (len(v_s) + len(h_s))
table_data['line_p'] = 100 * (1 - nouse)
# set spanning cells to True
@ -351,27 +349,27 @@ class Lattice:
assignment_errors = []
table_data['split_text'] = []
table_data['superscript'] = []
for direction in t_bbox:
for direction in ['vertical', 'horizontal']:
for t in t_bbox[direction]:
indices, error = get_table_index(
table, t, direction, split_text=self.split_text,
flag_size=self.flag_size)
assignment_errors.append(error)
indices = _reduce_index(table, indices, shift_text=self.shift_text,)
if len(indices) > 1:
table_data['split_text'].append(indices)
for r_idx, c_idx, text in indices:
if all(s in text for s in ['<s>', '</s>']):
table_data['superscript'].append((r_idx, c_idx, text))
table.cells[r_idx][c_idx].add_text(text)
if indices[:2] != (-1, -1):
assignment_errors.append(error)
indices = _reduce_index(table, indices, shift_text=self.shift_text)
if len(indices) > 1:
table_data['split_text'].append(indices)
for r_idx, c_idx, text in indices:
if all(s in text for s in ['<s>', '</s>']):
table_data['superscript'].append((r_idx, c_idx, text))
table.cells[r_idx][c_idx].add_text(text)
score = get_score([[100, assignment_errors]])
table_data['score'] = score
if self.fill is not None:
table = _fill_spanning(table, fill=self.fill[table_no])
ar = table.get_list()
if self.headers is not None and self.headers[table_no] != ['']:
ar.insert(0, self.headers[table_no])
ar = remove_empty(ar)
ar = encode_list(ar)
table_data['data'] = ar
empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)

View File

@ -7,19 +7,18 @@ from PIL import Image
from .table import Table
from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
find_table_joints)
from .utils import merge_close_values, encode_list
find_table_joints, find_cuts)
from .utils import merge_close_values, encode_list, remove_empty
class OCR:
"""Uses optical character recognition to get text out of image based pdfs.
Currently works only on pdfs with lines.
class OCRLattice:
"""Lattice, but for images.
Parameters
----------
table_area : list
List of strings of the form x1,y1,x2,y2 where
(x1, y1) -> left-top and (x2, y2) -> right-bottom in PDFMiner's
(x1, y1) -> left-top and (x2, y2) -> right-bottom in OpenCV's
coordinate space, denoting table areas to analyze.
(optional, default: None)
@ -27,12 +26,12 @@ class OCR:
List of ints specifying m-tolerance parameters.
(optional, default: [2])
blocksize: int
blocksize : int
Size of a pixel neighborhood that is used to calculate a
threshold value for the pixel: 3, 5, 7, and so on.
(optional, default: 15)
threshold_constant: float
threshold_constant : float
Constant subtracted from the mean or weighted mean
(see the details below). Normally, it is positive but may be
zero or negative as well.
@ -51,6 +50,10 @@ class OCR:
element for image processing.
(optional, default: 15)
iterations : int
Number of iterations for dilation.
(optional, default: 2)
debug : string
{'contour', 'line', 'joint', 'table'}
Set to one of the above values to generate a matplotlib plot
@ -58,9 +61,9 @@ class OCR:
(optional, default: None)
"""
def __init__(self, table_area=None, mtol=[2], blocksize=15, threshold_constant=-2,
dpi=300, lang="eng", scale=15, debug=None):
dpi=300, lang="eng", scale=15, iterations=2, debug=None):
self.method = 'ocr'
self.method = 'ocrl'
self.table_area = table_area
self.mtol = mtol
self.blocksize = blocksize
@ -69,11 +72,13 @@ class OCR:
self.dpi = dpi
self.lang = lang
self.scale = scale
self.iterations = iterations
self.debug = debug
def get_tables(self, pdfname):
if self.tool is None:
return None
bname, __ = os.path.splitext(pdfname)
imagename = ''.join([bname, '.png'])
@ -91,9 +96,9 @@ class OCR:
img, threshold = adaptive_threshold(imagename, blocksize=self.blocksize,
c=self.threshold_constant)
vmask, v_segments = find_lines(threshold, direction='vertical',
scale=self.scale)
scale=self.scale, iterations=self.iterations)
hmask, h_segments = find_lines(threshold, direction='horizontal',
scale=self.scale)
scale=self.scale, iterations=self.iterations)
if self.table_area is not None:
areas = []
@ -154,6 +159,7 @@ class OCR:
ar = table.get_list()
ar.reverse()
ar = encode_list(ar)
ar = remove_empty(ar)
table_data['data'] = ar
tables['table-{0}'.format(table_no + 1)] = table_data
table_no += 1
@ -162,4 +168,142 @@ class OCR:
if self.debug:
return None
return page
class OCRStream:
    """Stream, but for images.

    Parameters
    ----------
    table_area : list
        List of strings of the form x1,y1,x2,y2 where
        (x1, y1) -> left-top and (x2, y2) -> right-bottom in OpenCV's
        coordinate space, denoting table areas to analyze.
        (optional, default: None)

    columns : list
        List of strings where each string is comma-separated values of
        x-coordinates in OpenCV's coordinate space.
        (optional, default: None)

    blocksize : int
        Size of a pixel neighborhood that is used to calculate a
        threshold value for the pixel: 3, 5, 7, and so on.
        (optional, default: 15)

    threshold_constant : float
        Constant subtracted from the mean or weighted mean
        (see the details below). Normally, it is positive but may be
        zero or negative as well.
        (optional, default: -2)

    line_threshold : int
        Maximum intensity of projections on y-axis.
        (optional, default: 100)

    dpi : int
        Dots per inch.
        (optional, default: 300)

    lang : string
        Language to be used for OCR.
        (optional, default: 'eng')

    debug : bool
        If set, get_tables stores the page image in debug_images and
        returns None instead of extracting tables.
        (optional, default: False)
    """
    def __init__(self, table_area=None, columns=None, blocksize=15,
                 threshold_constant=-2, line_threshold=100, dpi=300, lang="eng",
                 debug=False):

        self.method = 'ocrs'
        self.table_area = table_area
        self.columns = columns
        self.blocksize = blocksize
        self.threshold_constant = threshold_constant
        self.line_threshold = line_threshold
        # Indexing [0] directly raises IndexError when no OCR tool is
        # available; fall back to None so get_tables can bail out cleanly.
        available_tools = pyocr.get_available_tools()
        self.tool = available_tools[0] if available_tools else None
        self.dpi = dpi
        self.lang = lang
        self.debug = debug

    def get_tables(self, pdfname):
        """Extracts tables from an image based pdf page using OCR.

        Renders the pdf to a png with ghostscript, thresholds the image,
        cuts every table area into cells using the user specified columns
        and the row cuts returned by find_cuts, and runs OCR on each cell.

        Parameters
        ----------
        pdfname : string
            Path to the pdf file; the png is written next to it.

        Returns
        -------
        page : dict
            Maps the pdf's base name (without extension) to a dict of
            'table-N' -> {'data': ...}. None is returned when no OCR tool
            is available or when debug is set.

        Raises
        ------
        ValueError
            If table_area and columns have different lengths.
        NotImplementedError
            If columns is None when a table is processed.
        """
        if self.tool is None:
            return None

        bname, __ = os.path.splitext(pdfname)
        imagename = ''.join([bname, '.png'])

        gs_call = [
            "-q", "-sDEVICE=png16m", "-o", imagename, "-r{0}".format(self.dpi),
            pdfname
        ]
        if "ghostscript" in subprocess.check_output(["gs", "-version"]).lower():
            gs_call.insert(0, "gs")
        else:
            gs_call.insert(0, "gsc")
        # Close the devnull handle once ghostscript is done instead of
        # leaking one file descriptor per processed page.
        with open(os.devnull, 'w') as devnull:
            subprocess.call(gs_call, stdout=devnull, stderr=subprocess.STDOUT)

        img, threshold = adaptive_threshold(imagename, blocksize=self.blocksize,
                                            c=self.threshold_constant)
        height, width = threshold.shape
        if self.debug:
            self.debug_images = img
            return None

        if self.table_area is not None:
            if self.columns is not None:
                if len(self.table_area) != len(self.columns):
                    raise ValueError("Length of table area and columns should be equal.")

            table_bbox = {}
            for area in self.table_area:
                x1, y1, x2, y2 = area.split(",")
                x1 = int(x1)
                y1 = int(y1)
                x2 = int(x2)
                y2 = int(y2)
                table_bbox[(x1, y1, x2, y2)] = None
        else:
            # No areas given: treat the whole page image as one table.
            table_bbox = {(0, 0, width, height): None}

        page = {}
        tables = {}
        table_no = 0
        for k in sorted(table_bbox.keys(), key=lambda x: x[1]):
            if self.columns is None:
                raise NotImplementedError

            table_data = {}
            table_image = threshold[k[1]:k[3], k[0]:k[2]]
            cols = self.columns[table_no].split(',')
            cols = [float(c) for c in cols]
            cols.insert(0, k[0])
            cols.append(k[2])
            # Shift the column boundaries so they are relative to the
            # cropped table image.
            cols = [(cols[i] - k[0], cols[i + 1] - k[0]) for i in range(0, len(cols) - 1)]
            y_cuts = find_cuts(table_image, line_threshold=self.line_threshold)
            rows = [(y_cuts[i], y_cuts[i + 1]) for i in range(0, len(y_cuts) - 1)]
            table = Table(cols, rows)
            for i in range(len(table.cells)):
                for j in range(len(table.cells[i])):
                    x1 = int(table.cells[i][j].x1)
                    y1 = int(table.cells[i][j].y1)
                    x2 = int(table.cells[i][j].x2)
                    y2 = int(table.cells[i][j].y2)
                    table.cells[i][j].image = table_image[y1:y2, x1:x2]
                    cell_image = Image.fromarray(table.cells[i][j].image)
                    text = self.tool.image_to_string(
                        cell_image,
                        lang=self.lang,
                        builder=pyocr.builders.TextBuilder()
                    )
                    table.cells[i][j].add_text(text)
            ar = table.get_list()
            ar.reverse()
            ar = encode_list(ar)
            ar = remove_empty(ar)
            table_data['data'] = ar
            tables['table-{0}'.format(table_no + 1)] = table_data
            table_no += 1
        page[os.path.basename(bname)] = tables

        return page

View File

@ -141,11 +141,14 @@ class Pdf:
if self.extractor.method == 'stream':
self.debug = self.extractor.debug
self.debug_text = []
elif self.extractor.method in ['lattice', 'ocr']:
elif self.extractor.method in ['lattice', 'ocrl']:
self.debug = self.extractor.debug
self.debug_images = []
self.debug_segments = []
self.debug_tables = []
elif self.extractor.method == 'ocrs':
self.debug = self.extractor.debug
self.debug_images = []
for p in pages:
table = self.extractor.get_tables(p)
if table is not None:
@ -157,6 +160,8 @@ class Pdf:
self.debug_images.append(self.extractor.debug_images)
self.debug_segments.append(self.extractor.debug_segments)
self.debug_tables.append(self.extractor.debug_tables)
elif self.extractor.method == 'ocrs':
self.debug_images.append(self.extractor.debug_images)
if self.clean:
self.remove_tempdir()
return tables
@ -175,7 +180,7 @@ class Pdf:
import matplotlib.patches as patches
if self.debug is True:
try:
if hasattr(self, 'debug_text'):
for text in self.debug_text:
fig = plt.figure()
ax = fig.add_subplot(111, aspect='equal')
@ -193,8 +198,10 @@ class Pdf:
ax.set_xlim(min(xs) - 10, max(xs) + 10)
ax.set_ylim(min(ys) - 10, max(ys) + 10)
plt.show()
except AttributeError:
raise ValueError("This option only be used with Stream.")
elif hasattr(self, 'debug_images'):
for img in self.debug_images:
plt.imshow(img)
plt.show()
elif self.debug == 'contour':
try:
for img, table_bbox in self.debug_images:

View File

@ -236,10 +236,6 @@ class Stream:
x-coordinates in PDFMiner's coordinate space.
(optional, default: None)
headers : list
List of strings where each string is a csv header for a table.
(optional, default: None)
ytol : list
List of ints specifying the y-tolerance parameters.
(optional, default: [2])
@ -268,14 +264,13 @@ class Stream:
LTTextLineHorizontals in order to select table_area, columns.
(optional, default: False)
"""
def __init__(self, table_area=None, columns=None, headers=None,
ytol=[2], mtol=[0], margins=(1.0, 0.5, 0.1),
split_text=False, flag_size=True, debug=False):
def __init__(self, table_area=None, columns=None, ytol=[2], mtol=[0],
margins=(1.0, 0.5, 0.1), split_text=False, flag_size=True,
debug=False):
self.method = 'stream'
self.table_area = table_area
self.columns = columns
self.headers = headers
self.ytol = ytol
self.mtol = mtol
self.char_margin, self.line_margin, self.word_margin = margins
@ -312,14 +307,12 @@ class Stream:
self.debug_text = []
self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlh])
self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlv])
return None
if self.table_area is not None:
if self.columns is not None:
if len(self.table_area) != len(self.columns):
raise ValueError("Length of columns should be equal to table_area.")
if self.headers is not None:
if len(self.table_area) != len(self.headers):
raise ValueError("Length of headers should be equal to table_area.")
raise ValueError("Length of table area and columns should be equal.")
table_bbox = {}
for area in self.table_area:
@ -336,6 +329,7 @@ class Stream:
ytolerance = copy.deepcopy(self.ytol) * len(table_bbox)
else:
ytolerance = copy.deepcopy(self.ytol)
if len(self.mtol) == 1 and self.mtol[0] == 0:
mtolerance = copy.deepcopy(self.mtol) * len(table_bbox)
else:
@ -374,7 +368,7 @@ class Stream:
guess = True
ncols = max(set(elements), key=elements.count)
len_non_mode = len(filter(lambda x: x != ncols, elements))
if ncols == 1 and not self.debug:
if ncols == 1:
# no tables detected
logger.warning("{}: Only one column was detected, the pdf"
" may have no tables.".format(
@ -396,15 +390,6 @@ class Stream:
cols = _add_columns(cols, inner_text, ytolerance[table_no])
cols = _join_columns(cols, text_x_min, text_x_max)
if self.headers is not None and self.headers[table_no] != [""]:
self.headers[table_no] = self.headers[table_no].split(',')
if len(self.headers[table_no]) != len(cols):
logger.warning("Length of header ({0}) specified for table is not"
" equal to the number of columns ({1}) detected.".format(
len(self.headers[table_no]), len(cols)))
while len(self.headers[table_no]) != len(cols):
self.headers[table_no].append('')
table = Table(cols, rows)
table = table.set_all_edges()
assignment_errors = []
@ -429,8 +414,6 @@ class Stream:
table_data['score'] = score
ar = table.get_list()
if self.headers is not None and self.headers[table_no] != ['']:
ar.insert(0, self.headers[table_no])
ar = encode_list(ar)
table_data['data'] = ar
empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)

View File

@ -188,38 +188,32 @@ class Table:
bound = self.cells[r][c].get_bounded_edges()
if bound == 4:
continue
elif bound == 3:
if not self.cells[r][c].left:
if (self.cells[r][c].right and
self.cells[r][c].top and
self.cells[r][c].bottom):
self.cells[r][c].spanning_h = True
elif not self.cells[r][c].right:
if (self.cells[r][c].left and
self.cells[r][c].top and
self.cells[r][c].bottom):
self.cells[r][c].spanning_h = True
elif not self.cells[r][c].top:
if (self.cells[r][c].left and
self.cells[r][c].right and
self.cells[r][c].bottom):
self.cells[r][c].spanning_v = True
elif not self.cells[r][c].bottom:
if (self.cells[r][c].left and
self.cells[r][c].right and
self.cells[r][c].top):
self.cells[r][c].spanning_v = True
elif bound == 2:
if self.cells[r][c].left and self.cells[r][c].right:
if (not self.cells[r][c].top and
not self.cells[r][c].bottom):
self.cells[r][c].spanning_v = True
elif self.cells[r][c].top and self.cells[r][c].bottom:
if (not self.cells[r][c].left and
not self.cells[r][c].right):

View File

@ -426,40 +426,43 @@ def split_textline(table, textline, direction, flag_size=True):
idx = 0
cut_text = []
bbox = textline.bbox
if direction == 'horizontal' and not textline.is_empty():
x_overlap = [i for i, x in enumerate(table.cols) if x[0] <= bbox[2] and bbox[0] <= x[1]]
r_idx = [j for j, r in enumerate(table.rows) if r[1] <= (bbox[1] + bbox[3]) / 2 <= r[0]]
r = r_idx[0]
x_cuts = [(c, table.cells[r][c].x2) for c in x_overlap if table.cells[r][c].right]
if not x_cuts:
x_cuts = [(x_overlap[0], table.cells[r][-1].x2)]
for obj in textline._objs:
row = table.rows[r]
for cut in x_cuts:
if isinstance(obj, LTChar):
if (row[1] <= (obj.y0 + obj.y1) / 2 <= row[0] and
(obj.x0 + obj.x1) / 2 <= cut[1]):
try:
if direction == 'horizontal' and not textline.is_empty():
x_overlap = [i for i, x in enumerate(table.cols) if x[0] <= bbox[2] and bbox[0] <= x[1]]
r_idx = [j for j, r in enumerate(table.rows) if r[1] <= (bbox[1] + bbox[3]) / 2 <= r[0]]
r = r_idx[0]
x_cuts = [(c, table.cells[r][c].x2) for c in x_overlap if table.cells[r][c].right]
if not x_cuts:
x_cuts = [(x_overlap[0], table.cells[r][-1].x2)]
for obj in textline._objs:
row = table.rows[r]
for cut in x_cuts:
if isinstance(obj, LTChar):
if (row[1] <= (obj.y0 + obj.y1) / 2 <= row[0] and
(obj.x0 + obj.x1) / 2 <= cut[1]):
cut_text.append((r, cut[0], obj))
break
elif isinstance(obj, LTAnno):
cut_text.append((r, cut[0], obj))
break
elif isinstance(obj, LTAnno):
cut_text.append((r, cut[0], obj))
elif direction == 'vertical' and not textline.is_empty():
y_overlap = [j for j, y in enumerate(table.rows) if y[1] <= bbox[3] and bbox[1] <= y[0]]
c_idx = [i for i, c in enumerate(table.cols) if c[0] <= (bbox[0] + bbox[2]) / 2 <= c[1]]
c = c_idx[0]
y_cuts = [(r, table.cells[r][c].y1) for r in y_overlap if table.cells[r][c].bottom]
if not y_cuts:
y_cuts = [(y_overlap[0], table.cells[-1][c].y1)]
for obj in textline._objs:
col = table.cols[c]
for cut in y_cuts:
if isinstance(obj, LTChar):
if (col[0] <= (obj.x0 + obj.x1) / 2 <= col[1] and
(obj.y0 + obj.y1) / 2 >= cut[1]):
elif direction == 'vertical' and not textline.is_empty():
y_overlap = [j for j, y in enumerate(table.rows) if y[1] <= bbox[3] and bbox[1] <= y[0]]
c_idx = [i for i, c in enumerate(table.cols) if c[0] <= (bbox[0] + bbox[2]) / 2 <= c[1]]
c = c_idx[0]
y_cuts = [(r, table.cells[r][c].y1) for r in y_overlap if table.cells[r][c].bottom]
if not y_cuts:
y_cuts = [(y_overlap[0], table.cells[-1][c].y1)]
for obj in textline._objs:
col = table.cols[c]
for cut in y_cuts:
if isinstance(obj, LTChar):
if (col[0] <= (obj.x0 + obj.x1) / 2 <= col[1] and
(obj.y0 + obj.y1) / 2 >= cut[1]):
cut_text.append((cut[0], c, obj))
break
elif isinstance(obj, LTAnno):
cut_text.append((cut[0], c, obj))
break
elif isinstance(obj, LTAnno):
cut_text.append((cut[0], c, obj))
except IndexError:
return [(-1, -1, textline.get_text())]
grouped_chars = []
for key, chars in groupby(cut_text, itemgetter(0, 1)):
if flag_size:

View File

@ -18,7 +18,7 @@ from PyPDF2 import PdfFileReader
from camelot.pdf import Pdf
from camelot.lattice import Lattice
from camelot.stream import Stream
from camelot.ocr import OCR
from camelot.ocr import OCRLattice, OCRStream
from camelot import utils
@ -54,7 +54,8 @@ options:
camelot methods:
lattice Looks for lines between data.
stream Looks for spaces between data.
ocr Looks for lines in image based pdfs.
ocrl Lattice, but for images.
ocrs Stream, but for images.
See 'camelot <method> -h' for more information on a specific method.
"""
@ -63,20 +64,22 @@ lattice_doc = """
Lattice method looks for lines between text to form a table.
usage:
camelot lattice [-t <tarea>...] [-F <fill>...] [-H <header>...]
[-m <mtol>...] [options] [--] <file>
camelot lattice [-t <tarea>...] [-F <fill>...] [-m <mtol>...]
[-j <jtol>...] [options] [--] <file>
options:
-t, --tarea <tarea> Specific table areas to analyze.
-F, --fill <fill> Fill data in horizontal and/or vertical spanning
cells. Example: -F h, -F v, -F hv
-H, --header <header> Specify header for each table.
-m, --mtol <mtol> Tolerance to account for when merging lines
which are very close. [default: 2]
-j, --jtol <jtol> Tolerance to account for when matching line endings
with intersections. [default: 2]
-b, --blocksize <blocksize> See adaptive threshold doc. [default: 15]
-c, --constant <constant> See adaptive threshold doc. [default: -2]
-C, --constant <constant> See adaptive threshold doc. [default: -2]
-s, --scale <scale> Scaling factor. Large scaling factor leads to
smaller lines being detected. [default: 15]
-I, --iterations <iterations> Number of iterations for dilation. [default: 2]
-i, --invert Invert pdf image to make sure that lines are
in foreground.
-T, --shift_text <shift_text> Specify where the text in a spanning cell
@ -89,41 +92,61 @@ stream_doc = """
Stream method looks for whitespaces between text to form a table.
usage:
camelot stream [-t <tarea>...] [-c <columns>...] [-H <header>...]
[-y <ytol>...] [-m <mtol>...] [options] [--] <file>
camelot stream [-t <tarea>...] [-c <columns>...] [-m <mtol>...]
[-y <ytol>...] [options] [--] <file>
options:
-t, --tarea <tarea> Specific table areas to analyze.
-c, --columns <columns> Comma-separated list of column x-coordinates.
Example: -c 10.1,20.2,30.3
-H, --header <header> Specify header for each table.
-y, --ytol <ytol> Tolerance to account for when grouping rows
together. [default: 2]
-m, --mtol <mtol> Tolerance to account for when merging columns
together. [default: 0]
-y, --ytol <ytol> Tolerance to account for when grouping rows
together. [default: 2]
-d, --debug Debug by visualizing textboxes.
"""
ocr_doc = """
OCR method looks for lines in image based pdfs.
ocrl_doc = """
Lattice, but for images.
usage:
camelot ocr [-t <tarea>] [-m <mtol>] [options] [--] <file>
camelot ocrl [-t <tarea>...] [-m <mtol>...] [options] [--] <file>
options:
-t, --tarea <tarea> Specific table areas to analyze.
-m, --mtol <mtol> Tolerance to account for when merging lines
which are very close. [default: 2]
-b, --blocksize <blocksize> See adaptive threshold doc. [default: 15]
-c, --constant <constant> See adaptive threshold doc. [default: -2]
-D, --dpi <dpi> Dots per inch, specify image quality to be used for OCR.
[default: 300]
-l, --lang <lang> Specify language to be used for OCR. [default: eng]
-s, --scale <scale> Scaling factor. Large scaling factor leads to
smaller lines being detected. [default: 15]
-d, --debug <debug> Debug by visualizing pdf geometry.
(contour,line,joint,table) Example: -d table
-t, --tarea <tarea> Specific table areas to analyze.
-m, --mtol <mtol> Tolerance to account for when merging lines
which are very close. [default: 2]
-b, --blocksize <blocksize> See adaptive threshold doc. [default: 15]
-C, --constant <constant> See adaptive threshold doc. [default: -2]
-D, --dpi <dpi> Dots per inch, specify image quality to be used for OCR.
[default: 300]
-l, --lang <lang> Specify language to be used for OCR. [default: eng]
-s, --scale <scale> Scaling factor. Large scaling factor leads to
smaller lines being detected. [default: 15]
-I, --iterations <iterations> Number of iterations for dilation. [default: 2]
-d, --debug <debug> Debug by visualizing pdf geometry.
(contour,line,joint,table) Example: -d table
"""
ocrs_doc = """
Stream, but for images.
usage:
camelot ocrs [-t <tarea>...] [-c <columns>...] [options] [--] <file>
options:
-t, --tarea <tarea> Specific table areas to analyze.
-c, --columns <columns> Comma-separated list of column x-coordinates.
Example: -c 10.1,20.2,30.3
-b, --blocksize <blocksize> See adaptive threshold doc. [default: 15]
-C, --constant <constant> See adaptive threshold doc. [default: -2]
-N, --line-threshold <line_threshold> Maximum intensity of projections on y-axis.
[default: 100]
-D, --dpi <dpi> Dots per inch, specify image quality to be used for OCR.
[default: 300]
-l, --lang <lang> Specify language to be used for OCR. [default: eng]
-d, --debug Debug by visualizing image.
"""
@ -351,8 +374,10 @@ if __name__ == '__main__':
args.update(docopt(lattice_doc, argv=argv))
elif args['<method>'] == 'stream':
args.update(docopt(stream_doc, argv=argv))
elif args['<method>'] == 'ocr':
args.update(docopt(ocr_doc, argv=argv))
elif args['<method>'] == 'ocrl':
args.update(docopt(ocrl_doc, argv=argv))
elif args['<method>'] == 'ocrs':
args.update(docopt(ocrs_doc, argv=argv))
filename = args['<file>']
filedir = os.path.dirname(args['<file>'])
@ -392,11 +417,12 @@ if __name__ == '__main__':
kwargs = {
'table_area': args['--tarea'] if args['--tarea'] else None,
'fill': args['--fill'] if args['--fill'] else None,
'headers': args['--header'] if args['--header'] else None,
'mtol': [int(m) for m in args['--mtol']],
'jtol': [int(j) for j in args['--jtol']],
'blocksize': int(args['--blocksize']),
'threshold_constant': float(args['--constant']),
'scale': int(args['--scale']),
'iterations': int(args['--iterations']),
'invert': args['--invert'],
'margins': margins,
'split_text': args['--split_text'],
@ -462,7 +488,6 @@ if __name__ == '__main__':
kwargs = {
'table_area': args['--tarea'] if args['--tarea'] else None,
'columns': args['--columns'] if args['--columns'] else None,
'headers': args['--header'] if args['--header'] else None,
'ytol': [int(y) for y in args['--ytol']],
'mtol': [int(m) for m in args['--mtol']],
'margins': margins,
@ -522,7 +547,7 @@ if __name__ == '__main__':
except Exception as e:
logger.exception(e.message, exc_info=True)
sys.exit()
elif args['<method>'] == 'ocr':
elif args['<method>'] == 'ocrl':
try:
kwargs = {
'table_area': args['--tarea'] if args['--tarea'] else None,
@ -532,9 +557,75 @@ if __name__ == '__main__':
'dpi': int(args['--dpi']),
'lang': args['--lang'],
'scale': int(args['--scale']),
'iterations': int(args['--iterations']),
'debug': args['--debug']
}
manager = Pdf(OCR(**kwargs), filename, pagenos=p, clean=True,
manager = Pdf(OCRLattice(**kwargs), filename, pagenos=p, clean=True,
parallel=args['--parallel'])
data = manager.extract()
processing_time = time.time() - start_time
logger.info("Finished processing in " + str(processing_time) + " seconds")
if args['--plot']:
if args['--output']:
pngname = os.path.join(args['--output'], os.path.basename(pngname))
plot_type = args['--plot'].split(',')
if 'page' in plot_type:
for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
page = data[page_number]
for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
table = page[table_number]
plot_table_barchart(table['r_nempty_cells'],
table['c_nempty_cells'],
table['empty_p'],
page_number,
table_number)
if 'all' in plot_type:
plot_all_barchart(data, pngname)
if 'rc' in plot_type:
plot_rc_piechart(data, pngname)
if args['--print-stats']:
print_stats(data, processing_time)
if args['--save-stats']:
if args['--output']:
scorename = os.path.join(args['--output'], os.path.basename(scorename))
with open(scorename, 'w') as score_file:
score_file.write('table,nrows,ncols,empty_p,line_p,text_p,score\n')
for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
page = data[page_number]
for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
table = page[table_number]
score_file.write('{0},{1},{2},{3},{4},{5},{6}\n'.format(
''.join([page_number, '_', table_number]),
table['nrows'],
table['ncols'],
table['empty_p'],
table['line_p'],
table['text_p'],
table['score']))
if args['--debug']:
manager.debug_plot()
except Exception as e:
logger.exception(e.message, exc_info=True)
sys.exit()
elif args['<method>'] == 'ocrs':
try:
kwargs = {
'table_area': args['--tarea'] if args['--tarea'] else None,
'columns': args['--columns'] if args['--columns'] else None,
'blocksize': int(args['--blocksize']),
'threshold_constant': float(args['--constant']),
'line_threshold': int(args['--line-threshold']),
'dpi': int(args['--dpi']),
'lang': args['--lang'],
'debug': args['--debug']
}
manager = Pdf(OCRStream(**kwargs), filename, pagenos=p, clean=True,
parallel=args['--parallel'])
data = manager.extract()
@ -588,7 +679,7 @@ if __name__ == '__main__':
logger.exception(e.message, exc_info=True)
sys.exit()
if args['--debug']:
if args.get('--debug') is not None and args['--debug']:
print("See 'camelot <method> -h' for various parameters you can tweak.")
else:
output = filedir if args['--output'] is None else args['--output']