Add table_area

[MRG] Add table_area
pull/2/head
Vinayak Mehta 2016-09-05 18:51:59 +05:30 committed by GitHub
parent 0bb6ce0bf9
commit d86630e70b
6 changed files with 343 additions and 296 deletions

View File

98
camelot/imgproc.py 100644
View File

@ -0,0 +1,98 @@
import cv2
import numpy as np
def adaptive_threshold(imagename, invert=False):
    """Read an image and binarise it with a Gaussian adaptive threshold.

    Parameters
    ----------
    imagename : string
        Path to image, passed to ``cv2.imread``.

    invert : bool
        When False the grayscale image is inverted with ``np.invert``
        before thresholding; when True it is thresholded as is.
        (optional, default: False)

    Returns
    -------
    img : object
        Value returned by ``cv2.imread`` for ``imagename``.

    threshold : object
        Value returned by ``cv2.adaptiveThreshold`` (maxValue 255,
        blockSize 15, C -0.2).
    """
    img = cv2.imread(imagename)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # The two modes differ only in the image handed to the threshold
    # call; every other argument is shared.
    source = gray if invert else np.invert(gray)
    threshold = cv2.adaptiveThreshold(
        source, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,
        15, -0.2)
    return img, threshold
def find_lines(threshold, direction=None, scale=15):
    """Find vertical or horizontal line segments in a thresholded image.

    The image is eroded and then dilated with a one pixel wide
    rectangular structuring element, and the bounding box of every
    external contour that survives is reported as a line segment.

    Parameters
    ----------
    threshold : object
        Thresholded image; must expose ``shape`` and be accepted by
        ``cv2.erode``.

    direction : string
        Either 'vertical' or 'horizontal'.

    scale : int
        The structuring element length is the image height (vertical)
        or width (horizontal) floor-divided by this value.
        (optional, default: 15)

    Returns
    -------
    dmask : object
        Independent copy of the eroded and dilated image, taken before
        contour detection runs.

    lines : list
        One ``(x1, y1, x2, y2)`` tuple per contour. Vertical segments
        sit on the horizontal midpoint of the bounding box and run from
        its bottom edge to its top edge; horizontal segments sit on the
        vertical midpoint and run from its left edge to its right edge.

    Raises
    ------
    ValueError
        If direction is anything other than 'vertical' or 'horizontal'.
    """
    # Validate first: any unsupported value (not only None) must fail
    # here instead of reaching the morphology calls with no element.
    if direction == 'vertical':
        size = threshold.shape[0] // scale
        el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
    elif direction == 'horizontal':
        size = threshold.shape[1] // scale
        el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
    else:
        raise ValueError("Specify direction as either 'vertical' or"
                         " 'horizontal'")

    threshold = cv2.erode(threshold, el, (-1, -1))
    threshold = cv2.dilate(threshold, el, (-1, -1))

    # findContours may modify its source image, so the mask handed back
    # to the caller has to be a real copy rather than an alias.
    dmask = threshold.copy()

    # The contour list is the second-to-last item of the returned tuple
    # whether findContours returns two or three values, so one call is
    # enough.
    contours = cv2.findContours(
        threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]

    lines = []
    for c in contours:
        x, y, w, h = cv2.boundingRect(c)
        x1, x2 = x, x + w
        y1, y2 = y, y + h
        if direction == 'vertical':
            lines.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1))
        else:
            lines.append((x1, (y1 + y2) / 2, x2, (y1 + y2) / 2))

    return dmask, lines
def find_table_contours(vertical, horizontal):
    """Locate candidate table regions from the two line masks.

    Parameters
    ----------
    vertical : object
        Mask of vertical lines.

    horizontal : object
        Mask of horizontal lines.

    Returns
    -------
    cont : list
        ``(x, y, w, h)`` bounding rectangles of at most ten external
        contours of ``vertical + horizontal``, largest contour area
        first.
    """
    mask = vertical + horizontal

    # findContours returns three values on some OpenCV versions and two
    # on others; fall back when the three-way unpack fails.
    try:
        __, found, __ = cv2.findContours(
            mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    except ValueError:
        found, __ = cv2.findContours(
            mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    biggest = sorted(found, key=cv2.contourArea, reverse=True)[:10]
    return [tuple(cv2.boundingRect(cv2.approxPolyDP(contour, 3, True)))
            for contour in biggest]
def find_table_joints(contours, vertical, horizontal):
    """Collect line intersections (joints) inside each table region.

    Parameters
    ----------
    contours : list
        ``(x, y, w, h)`` rectangles to inspect.

    vertical : object
        Mask of vertical lines.

    horizontal : object
        Mask of horizontal lines.

    Returns
    -------
    tables : dict
        Maps ``(x, y + h, x + w, y)`` for a region to the list of
        ``(c1, c2)`` joint centres found in it, expressed in the
        coordinates of the full masks.
    """
    joints = np.bitwise_and(vertical, horizontal)
    tables = {}

    for x, y, w, h in contours:
        roi = joints[y : y + h, x : x + w]
        # findContours returns three values on some OpenCV versions and
        # two on others; fall back when the three-way unpack fails.
        try:
            __, jc, __ = cv2.findContours(
                roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
        except ValueError:
            jc, __ = cv2.findContours(
                roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)

        # Regions with four or fewer joints are dropped.
        if len(jc) > 4:
            boxes = [cv2.boundingRect(j) for j in jc]
            # Centre of each joint box, shifted from region-local back
            # to mask coordinates.
            tables[(x, y + h, x + w, y)] = [
                (x + (2 * jx + jw) / 2, y + (2 * jy + jh) / 2)
                for jx, jy, jw, jh in boxes]

    return tables

View File

@ -4,15 +4,15 @@ import types
import copy_reg
import logging
import cv2
import numpy as np
from wand.image import Image
from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
find_table_joints)
from .table import Table
from .utils import (transform, segments_bbox, text_bbox, detect_vertical, merge_close_values,
get_row_index, get_column_index, get_score, reduce_index,
outline, fill_spanning, count_empty, encode_list, pdf_to_text)
from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_bbox,
detect_vertical, merge_close_values, get_row_index,
get_column_index, get_score, reduce_index, outline,
fill_spanning, count_empty, encode_list, pdf_to_text)
__all__ = ['Lattice']
@ -26,128 +26,6 @@ def _reduce_method(m):
copy_reg.pickle(types.MethodType, _reduce_method)
def _morph_transform(imagename, scale=15, invert=False):
    """Morphological Transformation

    Thresholds the image, isolates vertical and horizontal lines with
    erode/dilate passes, and derives table bounding boxes, joints and
    line segments from the resulting masks.

    http://answers.opencv.org/question/63847/how-to-extract-tables-from-an-image/

    adaptiveThreshold is called with blockSize=15 and C=-0.2; see also
    http://pequan.lip6.fr/~bereziat/pima/2012/seuillage/sezgin04.pdf

    Parameters
    ----------
    imagename : Path to image.

    scale : int
        Scaling factor. Large scaling factor leads to smaller lines
        being detected. (optional, default: 15)

    invert : bool
        When False the grayscale image is inverted before
        thresholding; when True it is thresholded as is.
        (optional, default: False)

    Returns
    -------
    img : ndarray
        Image as returned by ``cv2.imread``.

    tables : dict
        Dictionary with table bounding box ``(x, y + h, x + w, y)`` as
        key and list of joints found in the table as value.

    v_segments : list
        List of vertical line segments found in the image.

    h_segments : list
        List of horizontal line segments found in the image.
    """
    def _contours_of(image, mode):
        # findContours returns three values on some OpenCV versions and
        # two on others; fall back when the three-way unpack fails.
        try:
            __, found, __ = cv2.findContours(
                image, mode, cv2.CHAIN_APPROX_SIMPLE)
        except ValueError:
            found, __ = cv2.findContours(
                image, mode, cv2.CHAIN_APPROX_SIMPLE)
        return found

    img = cv2.imread(imagename)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    source = gray if invert else np.invert(gray)
    threshold = cv2.adaptiveThreshold(
        source, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,
        15, -0.2)

    # One pixel wide structuring elements sized relative to the image.
    ver = cv2.getStructuringElement(
        cv2.MORPH_RECT, (1, threshold.shape[0] // scale))
    hor = cv2.getStructuringElement(
        cv2.MORPH_RECT, (threshold.shape[1] // scale, 1))

    vertical = cv2.dilate(
        cv2.erode(threshold, ver, (-1, -1)), ver, (-1, -1))
    horizontal = cv2.dilate(
        cv2.erode(threshold, hor, (-1, -1)), hor, (-1, -1))

    mask = vertical + horizontal
    joints = np.bitwise_and(vertical, horizontal)

    candidates = sorted(_contours_of(mask, cv2.RETR_EXTERNAL),
                        key=cv2.contourArea, reverse=True)[:10]

    tables = {}
    for candidate in candidates:
        x, y, w, h = cv2.boundingRect(cv2.approxPolyDP(candidate, 3, True))
        roi = joints[y : y + h, x : x + w]
        jc = _contours_of(roi, cv2.RETR_CCOMP)
        # Regions with four or fewer joints are dropped.
        if len(jc) > 4:
            boxes = [cv2.boundingRect(j) for j in jc]
            tables[(x, y + h, x + w, y)] = [
                (x + (2 * jx + jw) / 2, y + (2 * jy + jh) / 2)
                for jx, jy, jw, jh in boxes]

    # Vertical segments: horizontal midpoint of each bounding box,
    # running from its bottom edge to its top edge.
    v_segments = []
    for vc in _contours_of(vertical, cv2.RETR_EXTERNAL):
        bx, by, bw, bh = cv2.boundingRect(vc)
        mid_x = (bx + (bx + bw)) / 2
        v_segments.append((mid_x, by + bh, mid_x, by))

    # Horizontal segments: vertical midpoint of each bounding box,
    # running from its left edge to its right edge.
    h_segments = []
    for hc in _contours_of(horizontal, cv2.RETR_EXTERNAL):
        bx, by, bw, bh = cv2.boundingRect(hc)
        mid_y = (by + (by + bh)) / 2
        h_segments.append((bx, mid_y, bx + bw, mid_y))

    return img, tables, v_segments, h_segments
class Lattice:
"""Lattice algorithm
@ -188,17 +66,17 @@ class Lattice:
Dictionary with page number as key and list of tables on that
page as value.
"""
def __init__(self, fill=None, scale=15, jtol=2, mtol=2,
invert=False, pdf_margin=(2.0, 0.5, 0.1), debug=None):
def __init__(self, table_area=None, fill=None, jtol=[2], mtol=[2], scale=15,
invert=False, margins=(2.0, 0.5, 0.1), debug=None):
self.method = 'lattice'
self.table_area = table_area
self.fill = fill
self.scale = scale
self.jtol = jtol
self.mtol = mtol
self.scale = scale
self.invert = invert
self.char_margin, self.line_margin, self.word_margin = pdf_margin
self.char_margin, self.line_margin, self.word_margin = margins
self.debug = debug
def get_tables(self, pdfname):
@ -217,48 +95,79 @@ class Lattice:
logging.warning("{0}: PDF has no text. It may be an image.".format(
os.path.basename(bname)))
return None
imagename = ''.join([bname, '.png'])
with Image(filename=pdfname, depth=8, resolution=300) as png:
png.save(filename=imagename)
img, threshold = adaptive_threshold(imagename, invert=self.invert)
pdf_x = width
pdf_y = height
img, table_bbox, v_segments, h_segments = _morph_transform(
imagename, scale=self.scale, invert=self.invert)
img_x = img.shape[1]
img_y = img.shape[0]
scaling_factor_x = pdf_x / float(img_x)
scaling_factor_y = pdf_y / float(img_y)
sc_x_image = img_x / float(pdf_x)
sc_y_image = img_y / float(pdf_y)
sc_x_pdf = pdf_x / float(img_x)
sc_y_pdf = pdf_y / float(img_y)
factors_image = (sc_x_image, sc_y_image, pdf_y)
factors_pdf = (sc_x_pdf, sc_y_pdf, img_y)
vmask, v_segments = find_lines(threshold, direction='vertical',
scale=self.scale)
hmask, h_segments = find_lines(threshold, direction='horizontal',
scale=self.scale)
if self.table_area:
if self.fill:
if len(self.table_area) != len(self.fill):
raise ValueError("message")
if len(self.jtol) == 1 and self.jtol[0] == 2:
self.jtol = self.jtol * len(self.table_area)
if len(self.mtol) == 1 and self.mtol[0] == 2:
self.mtol = self.mtol * len(self.table_area)
areas = []
for area in self.table_area:
x1, y1, x2, y2 = area.split(",")
x1 = int(x1)
y1 = int(y1)
x2 = int(x2)
y2 = int(y2)
x1, y1, x2, y2 = scale_to_image((x1, y1, x2, y2), factors_image)
areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
table_bbox = find_table_joints(areas, vmask, hmask)
else:
contours = find_table_contours(vmask, hmask)
table_bbox = find_table_joints(contours, vmask, hmask)
if self.debug:
self.debug_images = (img, table_bbox)
factors = (scaling_factor_x, scaling_factor_y, img_y)
table_bbox, v_segments, h_segments = transform(table_bbox, v_segments,
h_segments, factors)
table_bbox, v_segments, h_segments = scale_to_pdf(table_bbox, v_segments,
h_segments, factors_pdf)
if self.debug:
self.debug_segments = (v_segments, h_segments)
self.debug_tables = []
pdf_page = {}
page_tables = {}
table_no = 1
page = {}
tables = {}
table_no = 0
# sort tables based on y-coord
for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
# select edges which lie within table_bbox
table_info = {}
# select elements which lie within table_bbox
table_data = {}
v_s, h_s = segments_bbox(k, v_segments, h_segments)
t_bbox = text_bbox(k, text)
table_info['text_p'] = 100 * (1 - (len(t_bbox) / len(text)))
table_data['text_p'] = 100 * (1 - (len(t_bbox) / len(text)))
table_rotation = detect_vertical(t_bbox)
cols, rows = zip(*table_bbox[k])
cols, rows = list(cols), list(rows)
cols.extend([k[0], k[2]])
rows.extend([k[1], k[3]])
# sort horizontal and vertical segments
cols = merge_close_values(sorted(cols), mtol=self.mtol)
cols = merge_close_values(sorted(cols), mtol=self.mtol[table_no])
rows = merge_close_values(
sorted(rows, reverse=True), mtol=self.mtol)
sorted(rows, reverse=True), mtol=self.mtol[table_no])
# make grid using x and y coord of shortlisted rows and cols
cols = [(cols[i], cols[i + 1])
for i in range(0, len(cols) - 1)]
@ -266,9 +175,9 @@ class Lattice:
for i in range(0, len(rows) - 1)]
table = Table(cols, rows)
# set table edges to True using ver+hor lines
table = table.set_edges(v_s, h_s, jtol=self.jtol)
table = table.set_edges(v_s, h_s, jtol=self.jtol[table_no])
nouse = table.nocont_ / (len(v_s) + len(h_s))
table_info['line_p'] = 100 * (1 - nouse)
table_data['line_p'] = 100 * (1 - nouse)
# set spanning cells to True
table = table.set_spanning()
# set table border edges to True
@ -314,10 +223,10 @@ class Lattice:
for t in t_bbox]))
score = get_score([[50, rerror], [50, cerror]])
table_info['score'] = score
table_data['score'] = score
if self.fill is not None:
table = fill_spanning(table, fill=self.fill)
if self.fill:
table = fill_spanning(table, fill=self.fill[table_no])
ar = table.get_list()
if table_rotation == 'left':
ar = zip(*ar[::-1])
@ -325,18 +234,18 @@ class Lattice:
ar = zip(*ar[::1])
ar.reverse()
ar = encode_list(ar)
table_info['data'] = ar
table_data['data'] = ar
empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
table_info['empty_p'] = empty_p
table_info['r_nempty_cells'] = r_nempty_cells
table_info['c_nempty_cells'] = c_nempty_cells
table_info['nrows'] = len(ar)
table_info['ncols'] = len(ar[0])
page_tables['table_{0}'.format(table_no)] = table_info
table_data['empty_p'] = empty_p
table_data['r_nempty_cells'] = r_nempty_cells
table_data['c_nempty_cells'] = c_nempty_cells
table_data['nrows'] = len(ar)
table_data['ncols'] = len(ar[0])
tables['table-{0}'.format(table_no + 1)] = table_data
table_no += 1
pdf_page[os.path.basename(bname)] = page_tables
page[os.path.basename(bname)] = tables
if self.debug:
return None
return pdf_page
return page

View File

@ -7,7 +7,8 @@ import logging
import numpy as np
from .table import Table
from .utils import get_row_index, get_score, count_empty, encode_list, pdf_to_text
from .utils import (get_row_index, get_score, count_empty, encode_list,
pdf_to_text, text_bbox)
__all__ = ['Stream']
@ -133,6 +134,17 @@ def _get_column_index(t, columns):
return c_idx, error
def _join_rows(rows_grouped, text_y_max, text_y_min):
row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r)
if len(r) > 0 else 0 for r in rows_grouped]
rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
rows.insert(0, text_y_max)
rows.append(text_y_min)
rows = [(rows[i], rows[i + 1])
for i in range(0, len(rows) - 1)]
return rows
def _add_columns(cols, text, ytolerance):
if text:
text = _group_rows(text, ytol=ytolerance)
@ -143,14 +155,6 @@ def _add_columns(cols, text, ytolerance):
return cols
def _get_table_bounds(rows):
x0 = min([t.x0 for r in rows for t in r])
x1 = max([t.x1 for r in rows for t in r])
y0 = min([t.y0 for t in rows[-1]])
y1 = max([t.y1 for t in rows[0]])
return x0, x1, y0, y1
def _join_columns(cols, text_x_min, text_x_max):
cols = sorted(cols)
cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
@ -194,16 +198,16 @@ class Stream:
Dictionary with page number as key and list of tables on that
page as value.
"""
def __init__(self, ncolumns=0, columns=None, ytol=2, mtol=2,
pdf_margin=(2.0, 0.5, 0.1), debug=False):
def __init__(self, table_area=None, columns=None, ncolumns=None, ytol=[2],
mtol=[2], margins=(2.0, 0.5, 0.1), debug=False):
self.method = 'stream'
self.ncolumns = ncolumns
self.table_area = table_area
self.columns = columns
self.ncolumns = ncolumns
self.ytol = ytol
self.mtol = mtol
self.char_margin, self.line_margin, self.word_margin = pdf_margin
self.char_margin, self.line_margin, self.word_margin = margins
self.debug = debug
def get_tables(self, pdfname):
@ -222,106 +226,126 @@ class Stream:
logging.warning("{0}: PDF has no text. It may be an image.".format(
os.path.basename(bname)))
return None
text.sort(key=lambda x: (-x.y0, x.x0))
if self.debug:
self.debug_text = [(t.x0, t.y0, t.x1, t.y1) for t in text]
return None
rows_grouped = _group_rows(text, ytol=self.ytol)
elements = [len(r) for r in rows_grouped]
row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r)
if len(r) > 0 else 0 for r in rows_grouped]
rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
bounds = _get_table_bounds(rows_grouped)
rows.insert(0, bounds[3])
rows.append(bounds[2])
rows = [(rows[i], rows[i + 1])
for i in range(0, len(rows) - 1)]
guess = False
if self.columns:
# user has to input boundary columns too
# take (0, width) by default
# similar to else condition
# len can't be 1
cols = self.columns.split(',')
cols = [(float(cols[i]), float(cols[i + 1]))
for i in range(0, len(cols) - 1)]
else:
if self.table_area:
if self.columns:
if len(self.table_area) != len(self.columns):
raise ValueError("message")
if self.ncolumns:
ncols = self.ncolumns
cols = [(t.x0, t.x1)
for r in rows_grouped if len(r) == ncols for t in r]
cols = _merge_columns(sorted(cols), mtol=self.mtol)
if len(cols) != self.ncolumns:
logging.warning("{}: The number of columns after merge"
" isn't the same as what you specified."
" Change the value of mtol.".format(
os.path.basename(bname)))
cols = _join_columns(cols, bounds[0], bounds[1])
else:
guess = True
ncols = max(set(elements), key=elements.count)
len_non_mode = len(filter(lambda x: x != ncols, elements))
if ncols == 1 and not self.debug:
# no tables detected
logging.warning("{}: Only one column was detected, the PDF"
" may have no tables. Specify ncols if"
" the PDF has tables.".format(
os.path.basename(bname)))
cols = [(t.x0, t.x1)
for r in rows_grouped if len(r) == ncols for t in r]
cols = _merge_columns(sorted(cols), mtol=self.mtol)
inner_text = []
for i in range(1, len(cols)):
left = cols[i - 1][1]
right = cols[i][0]
inner_text.extend([t for t in text if t.x0 > left and t.x1 < right])
outer_text = [t for t in text if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
inner_text.extend(outer_text)
cols = _add_columns(cols, inner_text, self.ytol)
cols = _join_columns(cols, bounds[0], bounds[1])
pdf_page = {}
page_tables = {}
table_info = {}
table = Table(cols, rows)
rerror = []
cerror = []
for row in rows_grouped:
for t in row:
try:
r_idx, rass_error = get_row_index(t, rows)
except ValueError as e:
# couldn't assign LTTextLH to any cell
vprint(e.message)
continue
try:
c_idx, cass_error = _get_column_index(t, cols)
except ValueError as e:
# couldn't assign LTTextLH to any cell
vprint(e.message)
continue
rerror.append(rass_error)
cerror.append(cass_error)
table.cells[r_idx][c_idx].add_text(
t.get_text().strip('\n'))
if guess:
score = get_score([[33, rerror], [33, cerror], [34, [len_non_mode / len(elements)]]])
if len(self.table_area) != len(self.ncolumns):
raise ValueError("message")
if len(self.ytol) == 1 and self.ytol[0] == 2:
self.ytol = self.ytol * len(self.table_area)
if len(self.mtol) == 1 and self.mtol[0] == 2:
self.mtol = self.mtol * len(self.table_area)
table_bbox = {}
for area in self.table_area:
x1, y1, x2, y2 = area.split(",")
x1 = int(x1)
y1 = int(y1)
x2 = int(x2)
y2 = int(y2)
table_bbox[(x1, y2, x2, y1)] = None
else:
score = get_score([[50, rerror], [50, cerror]])
table_info['score'] = score
ar = table.get_list()
ar = encode_list(ar)
table_info['data'] = ar
empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
table_info['empty_p'] = empty_p
table_info['r_nempty_cells'] = r_nempty_cells
table_info['c_nempty_cells'] = c_nempty_cells
table_info['nrows'] = len(ar)
table_info['ncols'] = len(ar[0])
page_tables['table_1'] = table_info
pdf_page[os.path.basename(bname)] = page_tables
table_bbox = {(0, height, width, 0): None}
return pdf_page
page = {}
tables = {}
table_no = 0
# sort tables based on y-coord
for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
# select elements which lie within table_bbox
table_data = {}
t_bbox = text_bbox(k, text)
t_bbox.sort(key=lambda x: (-x.y0, x.x0))
rows_grouped = _group_rows(t_bbox, ytol=self.ytol[table_no])
rows = _join_rows(rows_grouped, k[3], k[1])
elements = [len(r) for r in rows_grouped]
guess = False
if self.columns and self.columns[table_no] != "":
# user has to input boundary columns too
# take (0, width) by default
# similar to else condition
# len can't be 1
cols = self.columns[table_no].split(',')
cols = [(float(cols[i]), float(cols[i + 1]))
for i in range(0, len(cols) - 1)]
else:
if self.ncolumns and self.ncolumns[table_no] != -1:
ncols = self.ncolumns[table_no]
cols = [(t.x0, t.x1)
for r in rows_grouped if len(r) == ncols for t in r]
cols = _merge_columns(sorted(cols), mtol=self.mtol[table_no])
if len(cols) != self.ncolumns[table_no]:
logging.warning("{}: The number of columns after merge"
" isn't the same as what you specified."
" Change the value of mtol.".format(
os.path.basename(bname)))
cols = _join_columns(cols, k[0], k[2])
else:
guess = True
ncols = max(set(elements), key=elements.count)
len_non_mode = len(filter(lambda x: x != ncols, elements))
if ncols == 1 and not self.debug:
# no tables detected
logging.warning("{}: Only one column was detected, the PDF"
" may have no tables. Specify ncols if"
" the PDF has tables.".format(
os.path.basename(bname)))
cols = [(t.x0, t.x1)
for r in rows_grouped if len(r) == ncols for t in r]
cols = _merge_columns(sorted(cols), mtol=self.mtol[table_no])
inner_text = []
for i in range(1, len(cols)):
left = cols[i - 1][1]
right = cols[i][0]
inner_text.extend([t for t in text if t.x0 > left and t.x1 < right])
outer_text = [t for t in text if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
inner_text.extend(outer_text)
cols = _add_columns(cols, inner_text, self.ytol[table_no])
cols = _join_columns(cols, k[0], k[2])
table = Table(cols, rows)
rerror = []
cerror = []
for row in rows_grouped:
for t in row:
try:
r_idx, rass_error = get_row_index(t, rows)
except ValueError as e:
# couldn't assign LTTextLH to any cell
continue
try:
c_idx, cass_error = _get_column_index(t, cols)
except ValueError as e:
# couldn't assign LTTextLH to any cell
continue
rerror.append(rass_error)
cerror.append(cass_error)
table.cells[r_idx][c_idx].add_text(
t.get_text().strip('\n'))
if guess:
score = get_score([[33, rerror], [33, cerror], [34, [len_non_mode / len(elements)]]])
else:
score = get_score([[50, rerror], [50, cerror]])
table_data['score'] = score
ar = encode_list(table.get_list())
table_data['data'] = ar
empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
table_data['empty_p'] = empty_p
table_data['r_nempty_cells'] = r_nempty_cells
table_data['c_nempty_cells'] = c_nempty_cells
table_data['nrows'] = len(ar)
table_data['ncols'] = len(ar[0])
tables['table-{0}'.format(table_no + 1)] = table_data
table_no += 1
page[os.path.basename(bname)] = tables
return page

View File

@ -81,7 +81,17 @@ def rotate(x1, y1, x2, y2, angle):
return xnew, ynew
def transform(tables, v_segments, h_segments, factors):
def scale_to_image(k, factors):
    """Convert an (x1, y1, x2, y2) box using the given scaling factors.

    Parameters
    ----------
    k : tuple
        Tuple (x1, y1, x2, y2).

    factors : tuple
        Tuple (scaling_factor_x, scaling_factor_y, pdf_y).

    Returns
    -------
    x1, y1, x2, y2 : tuple
        Converted coordinates, each truncated with ``int``. The x
        values go through ``scale`` with the x factor; the y values go
        through ``abs(translate(-pdf_y, y))`` and then ``scale`` with
        the y factor.
        NOTE(review): presumably this flips the y axis from PDF space
        to image space; confirm against ``scale`` and ``translate``.
    """
    left, top, right, bottom = k
    factor_x, factor_y, page_y = factors

    def _convert_y(value):
        # Same composition as the x conversion plus the translate/abs
        # step that is applied to y values only.
        return scale(abs(translate(-page_y, value)), factor_y)

    new_left = scale(left, factor_x)
    new_top = _convert_y(top)
    new_right = scale(right, factor_x)
    new_bottom = _convert_y(bottom)
    return int(new_left), int(new_top), int(new_right), int(new_bottom)
def scale_to_pdf(tables, v_segments, h_segments, factors):
"""Translates and scales OpenCV coordinates to PDFMiner coordinate
space.

View File

@ -40,9 +40,9 @@ options:
-W, --wmargin <wmargin> Word margin. Insert blank spaces between chars
if distance between words is greater than word
margin. [default: 0.1]
-S, --save-info Save parsing info for each page to a file.
-S, --print-stats List stats on the parsing process.
-T, --save-stats Save stats to a file.
-X, --plot <dist> Plot distributions. (page,all,rc)
-Z, --summary Summarize metrics.
camelot methods:
lattice Looks for lines between data.
@ -55,19 +55,21 @@ lattice_doc = """
Lattice method looks for lines between text to form a table.
usage:
camelot lattice [options] [--] <file>
camelot lattice [-t <tarea>...] [-F <fill>...] [-j <jtol>...]
[-m <mtol>...] [options] [--] <file>
options:
-t, --tarea <tarea> Specific table areas to analyze.
-F, --fill <fill> Fill data in horizontal and/or vertical spanning
cells. Example: -F h, -F v, -F hv
-s, --scale <scale> Scaling factor. Large scaling factor leads to
smaller lines being detected. [default: 15]
-i, --invert Invert pdf image to make sure that lines are
in foreground.
-j, --jtol <jtol> Tolerance to account for when comparing joint
and line coordinates. [default: 2]
-m, --mtol <mtol> Tolerance to account for when merging lines
which are very close. [default: 2]
-s, --scale <scale> Scaling factor. Large scaling factor leads to
smaller lines being detected. [default: 15]
-i, --invert Invert pdf image to make sure that lines are
in foreground.
-d, --debug <debug> Debug by visualizing pdf geometry.
(contour,line,joint,table) Example: -d table
"""
@ -76,12 +78,14 @@ stream_doc = """
Stream method looks for whitespaces between text to form a table.
usage:
camelot stream [options] [--] <file>
camelot stream [-t <tarea>...] [-c <columns>...] [-n <ncols>...] [-y <ytol>...]
[-m <mtol>...] [options] [--] <file>
options:
-n, --ncols <ncols> Number of columns. [default: 0]
-t, --tarea <tarea> Specific table areas to analyze.
-c, --columns <columns> Comma-separated list of column x-coordinates.
Example: -c 10.1,20.2,30.3
-n, --ncols <ncols> Number of columns. [default: -1]
-y, --ytol <ytol> Tolerance to account for when grouping rows
together. [default: 2]
-m, --mtol <mtol> Tolerance to account for when merging columns
@ -166,7 +170,7 @@ def plot_rc_piechart(data, output):
plt.savefig(''.join([output, '_rc.png']), dpi=300)
def summary(data, p_time):
def print_stats(data, p_time):
from operator import itemgetter
from itertools import groupby
@ -331,17 +335,18 @@ if __name__ == '__main__':
else:
p.append({'start': int(r), 'end': int(r)})
margin_tuple = (float(args['--cmargin']), float(args['--lmargin']),
margins = (float(args['--cmargin']), float(args['--lmargin']),
float(args['--wmargin']))
if args['<method>'] == 'lattice':
try:
manager = Pdf(Lattice(
table_area=args['--tarea'],
fill=args['--fill'],
jtol=[int(j) for j in args['--jtol']],
mtol=[int(m) for m in args['--mtol']],
scale=int(args['--scale']),
invert=args['--invert'],
jtol=int(args['--jtol']),
mtol=int(args['--mtol']),
pdf_margin=margin_tuple,
margins=margins,
debug=args['--debug']),
filename,
pagenos=p,
@ -374,10 +379,10 @@ if __name__ == '__main__':
if 'rc' in plot_type:
plot_rc_piechart(data, pngname)
if args['--summary']:
summary(data, processing_time)
if args['--print-stats']:
print_stats(data, processing_time)
if args['--save-info']:
if args['--save-stats']:
if args['--output']:
scorename = os.path.join(args['--output'], os.path.basename(scorename))
with open(scorename, 'w') as score_file:
@ -402,11 +407,12 @@ if __name__ == '__main__':
elif args['<method>'] == 'stream':
try:
manager = Pdf(Stream(
ncolumns=int(args['--ncols']),
table_area=args['--tarea'],
columns=args['--columns'],
ytol=int(args['--ytol']),
mtol=int(args['--mtol']),
pdf_margin=margin_tuple,
ncolumns=[int(nc) for nc in args['--ncols']],
ytol=[int(y) for y in args['--ytol']],
mtol=[int(m) for m in args['--mtol']],
margins=margins,
debug=args['--debug']),
filename,
pagenos=p,
@ -439,10 +445,10 @@ if __name__ == '__main__':
if 'rc' in plot_type:
plot_rc_piechart(data, pngname)
if args['--summary']:
summary(data, processing_time)
if args['--print-stats']:
print_stats(data, processing_time)
if args['--save-info']:
if args['--save-stats']:
if args['--output']:
scorename = os.path.join(args['--output'], os.path.basename(scorename))
with open(scorename, 'w') as score_file: