Add table_area

[MRG] Add table_area
pull/2/head
Vinayak Mehta 2016-09-05 18:51:59 +05:30 committed by GitHub
parent 0bb6ce0bf9
commit d86630e70b
6 changed files with 343 additions and 296 deletions

camelot/imgproc.py 100644 (new file, 98 lines)

@@ -0,0 +1,98 @@
import cv2
import numpy as np


def adaptive_threshold(imagename, invert=False):
    img = cv2.imread(imagename)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    if invert:
        threshold = cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,
            15, -0.2)
    else:
        threshold = cv2.adaptiveThreshold(
            np.invert(gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY,
            15, -0.2)
    return img, threshold


def find_lines(threshold, direction=None, scale=15):
    lines = []
    if direction == 'vertical':
        size = threshold.shape[0] // scale
        el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
    elif direction == 'horizontal':
        size = threshold.shape[1] // scale
        el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
    elif direction is None:
        raise ValueError("Specify direction as either 'vertical' or"
                         " 'horizontal'")
    threshold = cv2.erode(threshold, el, (-1, -1))
    threshold = cv2.dilate(threshold, el, (-1, -1))
    dmask = threshold  # findContours modifies source image
    try:
        # OpenCV >= 3 returns (image, contours, hierarchy)
        _, contours, _ = cv2.findContours(
            threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    except ValueError:
        # OpenCV 2.x returns (contours, hierarchy)
        contours, _ = cv2.findContours(
            threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    for c in contours:
        x, y, w, h = cv2.boundingRect(c)
        x1, x2 = x, x + w
        y1, y2 = y, y + h
        if direction == 'vertical':
            lines.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1))
        elif direction == 'horizontal':
            lines.append((x1, (y1 + y2) / 2, x2, (y1 + y2) / 2))
    return dmask, lines


def find_table_contours(vertical, horizontal):
    mask = vertical + horizontal
    try:
        __, contours, __ = cv2.findContours(
            mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    except ValueError:
        contours, __ = cv2.findContours(
            mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # keep the ten largest contours by area
    contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]
    cont = []
    for c in contours:
        c_poly = cv2.approxPolyDP(c, 3, True)
        x, y, w, h = cv2.boundingRect(c_poly)
        cont.append((x, y, w, h))
    return cont


def find_table_joints(contours, vertical, horizontal):
    joints = np.bitwise_and(vertical, horizontal)
    tables = {}
    for c in contours:
        x, y, w, h = c
        roi = joints[y : y + h, x : x + w]
        try:
            __, jc, __ = cv2.findContours(
                roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
        except ValueError:
            jc, __ = cv2.findContours(
                roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
        if len(jc) <= 4:  # remove contours with less than 4 joints
            continue
        joint_coords = []
        for j in jc:
            jx, jy, jw, jh = cv2.boundingRect(j)
            c1, c2 = x + (2 * jx + jw) / 2, y + (2 * jy + jh) / 2
            joint_coords.append((c1, c2))
        tables[(x, y + h, x + w, y)] = joint_coords
    return tables
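Taken together, these helpers break the old _morph_transform up into composable steps. A minimal sketch of how they chain, mirroring the calls made from the new Lattice.get_tables below (the image path is a placeholder and scale=15 is just the default):

from camelot.imgproc import (adaptive_threshold, find_lines,
                             find_table_contours, find_table_joints)

# "page-1.png" stands in for a PDF page rendered to an image.
img, threshold = adaptive_threshold("page-1.png", invert=False)
vmask, v_segments = find_lines(threshold, direction='vertical', scale=15)
hmask, h_segments = find_lines(threshold, direction='horizontal', scale=15)
contours = find_table_contours(vmask, hmask)
table_bbox = find_table_joints(contours, vmask, hmask)  # bbox -> joint coords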

camelot/lattice.py

@@ -4,15 +4,15 @@ import types
 import copy_reg
 import logging

-import cv2
-import numpy as np
 from wand.image import Image
+from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
+                      find_table_joints)
 from .table import Table
-from .utils import (transform, segments_bbox, text_bbox, detect_vertical, merge_close_values,
-                    get_row_index, get_column_index, get_score, reduce_index,
-                    outline, fill_spanning, count_empty, encode_list, pdf_to_text)
+from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_bbox,
+                    detect_vertical, merge_close_values, get_row_index,
+                    get_column_index, get_score, reduce_index, outline,
+                    fill_spanning, count_empty, encode_list, pdf_to_text)

 __all__ = ['Lattice']
@@ -26,128 +26,6 @@ def _reduce_method(m):

 copy_reg.pickle(types.MethodType, _reduce_method)

-
-def _morph_transform(imagename, scale=15, invert=False):
-    """Morphological Transformation
-
-    Applies a series of morphological operations on the image
-    to find table contours and line segments.
-    http://answers.opencv.org/question/63847/how-to-extract-tables-from-an-image/
-
-    Empirical result for adaptiveThreshold's blockSize=5 and C=-0.2
-    taken from http://pequan.lip6.fr/~bereziat/pima/2012/seuillage/sezgin04.pdf
-
-    Parameters
-    ----------
-    imagename : Path to image.
-
-    scale : int
-        Scaling factor. Large scaling factor leads to smaller lines
-        being detected. (optional, default: 15)
-
-    invert : bool
-        Invert pdf image to make sure that lines are in foreground.
-        (optional, default: False)
-
-    Returns
-    -------
-    img : ndarray
-
-    tables : dict
-        Dictionary with table bounding box as key and list of
-        joints found in the table as value.
-
-    v_segments : list
-        List of vertical line segments found in the image.
-
-    h_segments : list
-        List of horizontal line segments found in the image.
-    """
-    img = cv2.imread(imagename)
-    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
-    if invert:
-        threshold = cv2.adaptiveThreshold(
-            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,
-            15, -0.2)
-    else:
-        threshold = cv2.adaptiveThreshold(
-            np.invert(gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
-            cv2.THRESH_BINARY,
-            15, -0.2)
-    vertical = threshold
-    horizontal = threshold
-    verticalsize = vertical.shape[0] // scale
-    horizontalsize = horizontal.shape[1] // scale
-    ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize))
-    hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1))
-    vertical = cv2.erode(vertical, ver, (-1, -1))
-    vertical = cv2.dilate(vertical, ver, (-1, -1))
-    horizontal = cv2.erode(horizontal, hor, (-1, -1))
-    horizontal = cv2.dilate(horizontal, hor, (-1, -1))
-    mask = vertical + horizontal
-    joints = np.bitwise_and(vertical, horizontal)
-    try:
-        __, contours, __ = cv2.findContours(
-            mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-    except ValueError:
-        contours, __ = cv2.findContours(
-            mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-    contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]
-    tables = {}
-    for c in contours:
-        c_poly = cv2.approxPolyDP(c, 3, True)
-        x, y, w, h = cv2.boundingRect(c_poly)
-        roi = joints[y : y + h, x : x + w]
-        try:
-            __, jc, __ = cv2.findContours(
-                roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
-        except ValueError:
-            jc, __ = cv2.findContours(
-                roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
-        if len(jc) <= 4:  # remove contours with less than 4 joints
-            continue
-        joint_coords = []
-        for j in jc:
-            jx, jy, jw, jh = cv2.boundingRect(j)
-            c1, c2 = x + (2 * jx + jw) / 2, y + (2 * jy + jh) / 2
-            joint_coords.append((c1, c2))
-        tables[(x, y + h, x + w, y)] = joint_coords
-    v_segments, h_segments = [], []
-    try:
-        _, vcontours, _ = cv2.findContours(
-            vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-    except ValueError:
-        vcontours, _ = cv2.findContours(
-            vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-    for vc in vcontours:
-        x, y, w, h = cv2.boundingRect(vc)
-        x1, x2 = x, x + w
-        y1, y2 = y, y + h
-        v_segments.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1))
-    try:
-        _, hcontours, _ = cv2.findContours(
-            horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-    except ValueError:
-        hcontours, _ = cv2.findContours(
-            horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-    for hc in hcontours:
-        x, y, w, h = cv2.boundingRect(hc)
-        x1, x2 = x, x + w
-        y1, y2 = y, y + h
-        h_segments.append((x1, (y1 + y2) / 2, x2, (y1 + y2) / 2))
-    return img, tables, v_segments, h_segments
-
-
 class Lattice:
     """Lattice algorithm
@@ -188,17 +66,17 @@ class Lattice:
         Dictionary with page number as key and list of tables on that
         page as value.
     """
-    def __init__(self, fill=None, scale=15, jtol=2, mtol=2,
-                 invert=False, pdf_margin=(2.0, 0.5, 0.1), debug=None):
+    def __init__(self, table_area=None, fill=None, jtol=[2], mtol=[2], scale=15,
+                 invert=False, margins=(2.0, 0.5, 0.1), debug=None):
         self.method = 'lattice'
+        self.table_area = table_area
         self.fill = fill
-        self.scale = scale
         self.jtol = jtol
         self.mtol = mtol
+        self.scale = scale
         self.invert = invert
-        self.char_margin, self.line_margin, self.word_margin = pdf_margin
+        self.char_margin, self.line_margin, self.word_margin = margins
         self.debug = debug

     def get_tables(self, pdfname):
@@ -217,48 +95,79 @@ class Lattice:
             logging.warning("{0}: PDF has no text. It may be an image.".format(
                 os.path.basename(bname)))
             return None

         imagename = ''.join([bname, '.png'])
         with Image(filename=pdfname, depth=8, resolution=300) as png:
             png.save(filename=imagename)

+        img, threshold = adaptive_threshold(imagename, invert=self.invert)
         pdf_x = width
         pdf_y = height
-        img, table_bbox, v_segments, h_segments = _morph_transform(
-            imagename, scale=self.scale, invert=self.invert)
         img_x = img.shape[1]
         img_y = img.shape[0]
-        scaling_factor_x = pdf_x / float(img_x)
-        scaling_factor_y = pdf_y / float(img_y)
+        sc_x_image = img_x / float(pdf_x)
+        sc_y_image = img_y / float(pdf_y)
+        sc_x_pdf = pdf_x / float(img_x)
+        sc_y_pdf = pdf_y / float(img_y)
+        factors_image = (sc_x_image, sc_y_image, pdf_y)
+        factors_pdf = (sc_x_pdf, sc_y_pdf, img_y)
+        vmask, v_segments = find_lines(threshold, direction='vertical',
+                                       scale=self.scale)
+        hmask, h_segments = find_lines(threshold, direction='horizontal',
+                                       scale=self.scale)
+
+        if self.table_area:
+            if self.fill:
+                if len(self.table_area) != len(self.fill):
+                    raise ValueError("message")
+            if len(self.jtol) == 1 and self.jtol[0] == 2:
+                self.jtol = self.jtol * len(self.table_area)
+            if len(self.mtol) == 1 and self.mtol[0] == 2:
+                self.mtol = self.mtol * len(self.table_area)
+            areas = []
+            for area in self.table_area:
+                x1, y1, x2, y2 = area.split(",")
+                x1 = int(x1)
+                y1 = int(y1)
+                x2 = int(x2)
+                y2 = int(y2)
+                x1, y1, x2, y2 = scale_to_image((x1, y1, x2, y2), factors_image)
+                areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
+            table_bbox = find_table_joints(areas, vmask, hmask)
+        else:
+            contours = find_table_contours(vmask, hmask)
+            table_bbox = find_table_joints(contours, vmask, hmask)

         if self.debug:
             self.debug_images = (img, table_bbox)

-        factors = (scaling_factor_x, scaling_factor_y, img_y)
-        table_bbox, v_segments, h_segments = transform(table_bbox, v_segments,
-                                                       h_segments, factors)
+        table_bbox, v_segments, h_segments = scale_to_pdf(table_bbox, v_segments,
+                                                          h_segments, factors_pdf)

         if self.debug:
             self.debug_segments = (v_segments, h_segments)
             self.debug_tables = []

-        pdf_page = {}
-        page_tables = {}
-        table_no = 1
+        page = {}
+        tables = {}
+        table_no = 0
         # sort tables based on y-coord
         for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
-            # select edges which lie within table_bbox
-            table_info = {}
+            # select elements which lie within table_bbox
+            table_data = {}
             v_s, h_s = segments_bbox(k, v_segments, h_segments)
             t_bbox = text_bbox(k, text)
-            table_info['text_p'] = 100 * (1 - (len(t_bbox) / len(text)))
+            table_data['text_p'] = 100 * (1 - (len(t_bbox) / len(text)))
             table_rotation = detect_vertical(t_bbox)
             cols, rows = zip(*table_bbox[k])
             cols, rows = list(cols), list(rows)
             cols.extend([k[0], k[2]])
             rows.extend([k[1], k[3]])
             # sort horizontal and vertical segments
-            cols = merge_close_values(sorted(cols), mtol=self.mtol)
+            cols = merge_close_values(sorted(cols), mtol=self.mtol[table_no])
             rows = merge_close_values(
-                sorted(rows, reverse=True), mtol=self.mtol)
+                sorted(rows, reverse=True), mtol=self.mtol[table_no])
             # make grid using x and y coord of shortlisted rows and cols
             cols = [(cols[i], cols[i + 1])
                     for i in range(0, len(cols) - 1)]
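The two factor tuples above run in opposite directions: factors_image maps user-supplied PDF coordinates into image pixels before joint detection, while factors_pdf maps the detected geometry back into PDF space. A sketch of the per-area conversion performed above, with made-up page and image sizes:

from camelot.utils import scale_to_image

# Hypothetical sizes: a 612x792 pt page rendered to a 2550x3300 px image.
pdf_x, pdf_y, img_x, img_y = 612, 792, 2550, 3300
factors_image = (img_x / float(pdf_x), img_y / float(pdf_y), pdf_y)

area = "70,220,540,80"                          # "x1,y1,x2,y2" in PDF coordinates
x1, y1, x2, y2 = [int(c) for c in area.split(",")]
x1, y1, x2, y2 = scale_to_image((x1, y1, x2, y2), factors_image)
areas = [(x1, y1, abs(x2 - x1), abs(y2 - y1))]  # (x, y, w, h) boxes for find_table_joints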
@@ -266,9 +175,9 @@ class Lattice:
             rows = [(rows[i], rows[i + 1])
                     for i in range(0, len(rows) - 1)]
             table = Table(cols, rows)
             # set table edges to True using ver+hor lines
-            table = table.set_edges(v_s, h_s, jtol=self.jtol)
+            table = table.set_edges(v_s, h_s, jtol=self.jtol[table_no])
             nouse = table.nocont_ / (len(v_s) + len(h_s))
-            table_info['line_p'] = 100 * (1 - nouse)
+            table_data['line_p'] = 100 * (1 - nouse)
             # set spanning cells to True
             table = table.set_spanning()
             # set table border edges to True
@@ -314,10 +223,10 @@
                                 for t in t_bbox]))
                 score = get_score([[50, rerror], [50, cerror]])

-            table_info['score'] = score
+            table_data['score'] = score

-            if self.fill is not None:
-                table = fill_spanning(table, fill=self.fill)
+            if self.fill:
+                table = fill_spanning(table, fill=self.fill[table_no])

             ar = table.get_list()
             if table_rotation == 'left':
                 ar = zip(*ar[::-1])
@@ -325,18 +234,18 @@
                 ar = zip(*ar[::1])
                 ar.reverse()
             ar = encode_list(ar)
-            table_info['data'] = ar
+            table_data['data'] = ar
             empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
-            table_info['empty_p'] = empty_p
-            table_info['r_nempty_cells'] = r_nempty_cells
-            table_info['c_nempty_cells'] = c_nempty_cells
-            table_info['nrows'] = len(ar)
-            table_info['ncols'] = len(ar[0])
-            page_tables['table_{0}'.format(table_no)] = table_info
+            table_data['empty_p'] = empty_p
+            table_data['r_nempty_cells'] = r_nempty_cells
+            table_data['c_nempty_cells'] = c_nempty_cells
+            table_data['nrows'] = len(ar)
+            table_data['ncols'] = len(ar[0])
+            tables['table-{0}'.format(table_no + 1)] = table_data
             table_no += 1
-        pdf_page[os.path.basename(bname)] = page_tables
+        page[os.path.basename(bname)] = tables

         if self.debug:
             return None

-        return pdf_page
+        return page
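In short, the new keyword arguments are list-valued so that each table area can carry its own fill and tolerance settings. A hypothetical call (the coordinates and the single-page file name are made up; the CLI below builds these lists from repeated -t/-j/-m flags):

from camelot.lattice import Lattice

# One table area, given as "x1,y1,x2,y2" in PDF coordinates.
parser = Lattice(table_area=["70,220,540,80"], fill=["v"], jtol=[2], mtol=[2])
tables = parser.get_tables("page-1.pdf")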

camelot/stream.py

@@ -7,7 +7,8 @@ import logging
 import numpy as np

 from .table import Table
-from .utils import get_row_index, get_score, count_empty, encode_list, pdf_to_text
+from .utils import (get_row_index, get_score, count_empty, encode_list,
+                    pdf_to_text, text_bbox)

 __all__ = ['Stream']
@@ -133,6 +134,17 @@ def _get_column_index(t, columns):
     return c_idx, error


+def _join_rows(rows_grouped, text_y_max, text_y_min):
+    row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r)
+                if len(r) > 0 else 0 for r in rows_grouped]
+    rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
+    rows.insert(0, text_y_max)
+    rows.append(text_y_min)
+    rows = [(rows[i], rows[i + 1])
+            for i in range(0, len(rows) - 1)]
+    return rows
+
+
 def _add_columns(cols, text, ytolerance):
     if text:
         text = _group_rows(text, ytol=ytolerance)
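The new _join_rows helper converts grouped text rows into row boundaries: each boundary is the midpoint between consecutive row centres, capped by the table's top and bottom. A small illustration with mock text lines (only y0 and y1 are read; real input comes from pdfminer):

from collections import namedtuple

from camelot.stream import _join_rows

TL = namedtuple("TL", "y0 y1")  # stand-in for a pdfminer text line
rows_grouped = [[TL(695.0, 705.0)], [TL(645.0, 655.0)], [TL(595.0, 605.0)]]
print(_join_rows(rows_grouped, 720, 580))
# [(720, 675.0), (675.0, 625.0), (625.0, 580)]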
@@ -143,14 +155,6 @@ def _add_columns(cols, text, ytolerance):
     return cols


-def _get_table_bounds(rows):
-    x0 = min([t.x0 for r in rows for t in r])
-    x1 = max([t.x1 for r in rows for t in r])
-    y0 = min([t.y0 for t in rows[-1]])
-    y1 = max([t.y1 for t in rows[0]])
-    return x0, x1, y0, y1
-
-
 def _join_columns(cols, text_x_min, text_x_max):
     cols = sorted(cols)
     cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
@@ -194,16 +198,16 @@ class Stream:
         Dictionary with page number as key and list of tables on that
         page as value.
     """
-    def __init__(self, ncolumns=0, columns=None, ytol=2, mtol=2,
-                 pdf_margin=(2.0, 0.5, 0.1), debug=False):
+    def __init__(self, table_area=None, columns=None, ncolumns=None, ytol=[2],
+                 mtol=[2], margins=(2.0, 0.5, 0.1), debug=False):
         self.method = 'stream'
-        self.ncolumns = ncolumns
+        self.table_area = table_area
         self.columns = columns
+        self.ncolumns = ncolumns
         self.ytol = ytol
         self.mtol = mtol
-        self.char_margin, self.line_margin, self.word_margin = pdf_margin
+        self.char_margin, self.line_margin, self.word_margin = margins
         self.debug = debug

     def get_tables(self, pdfname):
@@ -222,106 +226,126 @@ class Stream:
             logging.warning("{0}: PDF has no text. It may be an image.".format(
                 os.path.basename(bname)))
             return None

-        text.sort(key=lambda x: (-x.y0, x.x0))

         if self.debug:
             self.debug_text = [(t.x0, t.y0, t.x1, t.y1) for t in text]
             return None

-        rows_grouped = _group_rows(text, ytol=self.ytol)
-        elements = [len(r) for r in rows_grouped]
-        row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r)
-                    if len(r) > 0 else 0 for r in rows_grouped]
-        rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
-        bounds = _get_table_bounds(rows_grouped)
-        rows.insert(0, bounds[3])
-        rows.append(bounds[2])
-        rows = [(rows[i], rows[i + 1])
-                for i in range(0, len(rows) - 1)]
-        guess = False
-        if self.columns:
-            # user has to input boundary columns too
-            # take (0, width) by default
-            # similar to else condition
-            # len can't be 1
-            cols = self.columns.split(',')
-            cols = [(float(cols[i]), float(cols[i + 1]))
-                    for i in range(0, len(cols) - 1)]
-        else:
-            if self.ncolumns:
-                ncols = self.ncolumns
-                cols = [(t.x0, t.x1)
-                        for r in rows_grouped if len(r) == ncols for t in r]
-                cols = _merge_columns(sorted(cols), mtol=self.mtol)
-                if len(cols) != self.ncolumns:
-                    logging.warning("{}: The number of columns after merge"
-                                    " isn't the same as what you specified."
-                                    " Change the value of mtol.".format(
-                                    os.path.basename(bname)))
-                cols = _join_columns(cols, bounds[0], bounds[1])
-            else:
-                guess = True
-                ncols = max(set(elements), key=elements.count)
-                len_non_mode = len(filter(lambda x: x != ncols, elements))
-                if ncols == 1 and not self.debug:
-                    # no tables detected
-                    logging.warning("{}: Only one column was detected, the PDF"
-                                    " may have no tables. Specify ncols if"
-                                    " the PDF has tables.".format(
-                                    os.path.basename(bname)))
-                cols = [(t.x0, t.x1)
-                        for r in rows_grouped if len(r) == ncols for t in r]
-                cols = _merge_columns(sorted(cols), mtol=self.mtol)
-                inner_text = []
-                for i in range(1, len(cols)):
-                    left = cols[i - 1][1]
-                    right = cols[i][0]
-                    inner_text.extend([t for t in text if t.x0 > left and t.x1 < right])
-                outer_text = [t for t in text if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
-                inner_text.extend(outer_text)
-                cols = _add_columns(cols, inner_text, self.ytol)
-                cols = _join_columns(cols, bounds[0], bounds[1])
-
-        pdf_page = {}
-        page_tables = {}
-        table_info = {}
-        table = Table(cols, rows)
-        rerror = []
-        cerror = []
-        for row in rows_grouped:
-            for t in row:
-                try:
-                    r_idx, rass_error = get_row_index(t, rows)
-                except ValueError as e:
-                    # couldn't assign LTTextLH to any cell
-                    vprint(e.message)
-                    continue
-                try:
-                    c_idx, cass_error = _get_column_index(t, cols)
-                except ValueError as e:
-                    # couldn't assign LTTextLH to any cell
-                    vprint(e.message)
-                    continue
-                rerror.append(rass_error)
-                cerror.append(cass_error)
-                table.cells[r_idx][c_idx].add_text(
-                    t.get_text().strip('\n'))
-        if guess:
-            score = get_score([[33, rerror], [33, cerror], [34, [len_non_mode / len(elements)]]])
-        else:
-            score = get_score([[50, rerror], [50, cerror]])
-        table_info['score'] = score
-        ar = table.get_list()
-        ar = encode_list(ar)
-        table_info['data'] = ar
-        empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
-        table_info['empty_p'] = empty_p
-        table_info['r_nempty_cells'] = r_nempty_cells
-        table_info['c_nempty_cells'] = c_nempty_cells
-        table_info['nrows'] = len(ar)
-        table_info['ncols'] = len(ar[0])
-        page_tables['table_1'] = table_info
-        pdf_page[os.path.basename(bname)] = page_tables
-        return pdf_page
+        if self.table_area:
+            if self.columns:
+                if len(self.table_area) != len(self.columns):
+                    raise ValueError("message")
+            if self.ncolumns:
+                if len(self.table_area) != len(self.ncolumns):
+                    raise ValueError("message")
+            if len(self.ytol) == 1 and self.ytol[0] == 2:
+                self.ytol = self.ytol * len(self.table_area)
+            if len(self.mtol) == 1 and self.mtol[0] == 2:
+                self.mtol = self.mtol * len(self.table_area)
+            table_bbox = {}
+            for area in self.table_area:
+                x1, y1, x2, y2 = area.split(",")
+                x1 = int(x1)
+                y1 = int(y1)
+                x2 = int(x2)
+                y2 = int(y2)
+                table_bbox[(x1, y2, x2, y1)] = None
+        else:
+            table_bbox = {(0, height, width, 0): None}
+
+        page = {}
+        tables = {}
+        table_no = 0
+        # sort tables based on y-coord
+        for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
+            # select elements which lie within table_bbox
+            table_data = {}
+            t_bbox = text_bbox(k, text)
+            t_bbox.sort(key=lambda x: (-x.y0, x.x0))
+            rows_grouped = _group_rows(t_bbox, ytol=self.ytol[table_no])
+            rows = _join_rows(rows_grouped, k[3], k[1])
+            elements = [len(r) for r in rows_grouped]
+            guess = False
+            if self.columns and self.columns[table_no] != "":
+                # user has to input boundary columns too
+                # take (0, width) by default
+                # similar to else condition
+                # len can't be 1
+                cols = self.columns[table_no].split(',')
+                cols = [(float(cols[i]), float(cols[i + 1]))
+                        for i in range(0, len(cols) - 1)]
+            else:
+                if self.ncolumns and self.ncolumns[table_no] != -1:
+                    ncols = self.ncolumns[table_no]
+                    cols = [(t.x0, t.x1)
+                            for r in rows_grouped if len(r) == ncols for t in r]
+                    cols = _merge_columns(sorted(cols), mtol=self.mtol[table_no])
+                    if len(cols) != self.ncolumns[table_no]:
+                        logging.warning("{}: The number of columns after merge"
+                                        " isn't the same as what you specified."
+                                        " Change the value of mtol.".format(
+                                        os.path.basename(bname)))
+                    cols = _join_columns(cols, k[0], k[2])
+                else:
+                    guess = True
+                    ncols = max(set(elements), key=elements.count)
+                    len_non_mode = len(filter(lambda x: x != ncols, elements))
+                    if ncols == 1 and not self.debug:
+                        # no tables detected
+                        logging.warning("{}: Only one column was detected, the PDF"
+                                        " may have no tables. Specify ncols if"
+                                        " the PDF has tables.".format(
+                                        os.path.basename(bname)))
+                    cols = [(t.x0, t.x1)
+                            for r in rows_grouped if len(r) == ncols for t in r]
+                    cols = _merge_columns(sorted(cols), mtol=self.mtol[table_no])
+                    inner_text = []
+                    for i in range(1, len(cols)):
+                        left = cols[i - 1][1]
+                        right = cols[i][0]
+                        inner_text.extend([t for t in text if t.x0 > left and t.x1 < right])
+                    outer_text = [t for t in text if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
+                    inner_text.extend(outer_text)
+                    cols = _add_columns(cols, inner_text, self.ytol[table_no])
+                    cols = _join_columns(cols, k[0], k[2])
+
+            table = Table(cols, rows)
+            rerror = []
+            cerror = []
+            for row in rows_grouped:
+                for t in row:
+                    try:
+                        r_idx, rass_error = get_row_index(t, rows)
+                    except ValueError as e:
+                        # couldn't assign LTTextLH to any cell
+                        continue
+                    try:
+                        c_idx, cass_error = _get_column_index(t, cols)
+                    except ValueError as e:
+                        # couldn't assign LTTextLH to any cell
+                        continue
+                    rerror.append(rass_error)
+                    cerror.append(cass_error)
+                    table.cells[r_idx][c_idx].add_text(
+                        t.get_text().strip('\n'))
+            if guess:
+                score = get_score([[33, rerror], [33, cerror], [34, [len_non_mode / len(elements)]]])
+            else:
+                score = get_score([[50, rerror], [50, cerror]])
+            table_data['score'] = score
+            ar = encode_list(table.get_list())
+            table_data['data'] = ar
+            empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
+            table_data['empty_p'] = empty_p
+            table_data['r_nempty_cells'] = r_nempty_cells
+            table_data['c_nempty_cells'] = c_nempty_cells
+            table_data['nrows'] = len(ar)
+            table_data['ncols'] = len(ar[0])
+            tables['table-{0}'.format(table_no + 1)] = table_data
+            table_no += 1
+        page[os.path.basename(bname)] = tables
+
+        return page
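As with Lattice, the Stream options become parallel lists with one entry per table area; an empty string for columns and -1 for ncolumns mean "not specified" for that area. A hypothetical call:

from camelot.stream import Stream

# One area covering the lower half of a US-letter page, holding a 4-column table.
parser = Stream(table_area=["0,396,612,0"], ncolumns=[4], ytol=[2], mtol=[2])
tables = parser.get_tables("page-1.pdf")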

camelot/utils.py

@@ -81,7 +81,17 @@ def rotate(x1, y1, x2, y2, angle):
     return xnew, ynew


-def transform(tables, v_segments, h_segments, factors):
+def scale_to_image(k, factors):
+    x1, y1, x2, y2 = k
+    scaling_factor_x, scaling_factor_y, pdf_y = factors
+    x1 = scale(x1, scaling_factor_x)
+    y1 = scale(abs(translate(-pdf_y, y1)), scaling_factor_y)
+    x2 = scale(x2, scaling_factor_x)
+    y2 = scale(abs(translate(-pdf_y, y2)), scaling_factor_y)
+    return int(x1), int(y1), int(x2), int(y2)
+
+
+def scale_to_pdf(tables, v_segments, h_segments, factors):
     """Translates and scales OpenCV coordinates to PDFMiner coordinate
     space.
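A quick worked example of the new helper, assuming (as the older transform code suggests) that scale multiplies by a factor and translate shifts by an offset; the sizes are illustrative:

from camelot.utils import scale_to_image

# A 612x792 pt page rendered to a 2550x3300 px image.
factors_image = (2550 / 612.0, 3300 / 792.0, 792)
print(scale_to_image((70, 220, 540, 80), factors_image))
# (291, 2383, 2250, 2966) -- PDF points mapped to image pixels, with y flipped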

camelot command-line script

@@ -40,9 +40,9 @@ options:
     -W, --wmargin <wmargin>   Word margin. Insert blank spaces between chars
                               if distance between words is greater than word
                               margin. [default: 0.1]
-    -S, --save-info           Save parsing info for each page to a file.
+    -S, --print-stats         List stats on the parsing process.
+    -T, --save-stats          Save stats to a file.
     -X, --plot <dist>         Plot distributions. (page,all,rc)
-    -Z, --summary             Summarize metrics.

 camelot methods:
     lattice    Looks for lines between data.
@@ -55,19 +55,21 @@
 Lattice method looks for lines between text to form a table.

 usage:
-    camelot lattice [options] [--] <file>
+    camelot lattice [-t <tarea>...] [-F <fill>...] [-j <jtol>...]
+                    [-m <mtol>...] [options] [--] <file>

 options:
+    -t, --tarea <tarea>       Specific table areas to analyze.
     -F, --fill <fill>         Fill data in horizontal and/or vertical spanning
                               cells. Example: -F h, -F v, -F hv
-    -s, --scale <scale>       Scaling factor. Large scaling factor leads to
-                              smaller lines being detected. [default: 15]
-    -i, --invert              Invert pdf image to make sure that lines are
-                              in foreground.
     -j, --jtol <jtol>         Tolerance to account for when comparing joint
                               and line coordinates. [default: 2]
     -m, --mtol <mtol>         Tolerance to account for when merging lines
                               which are very close. [default: 2]
+    -s, --scale <scale>       Scaling factor. Large scaling factor leads to
+                              smaller lines being detected. [default: 15]
+    -i, --invert              Invert pdf image to make sure that lines are
+                              in foreground.
     -d, --debug <debug>       Debug by visualizing pdf geometry.
                               (contour,line,joint,table) Example: -d table
 """
@@ -76,12 +78,14 @@ stream_doc = """
 Stream method looks for whitespaces between text to form a table.

 usage:
-    camelot stream [options] [--] <file>
+    camelot stream [-t <tarea>...] [-c <columns>...] [-n <ncols>...] [-y <ytol>...]
+                   [-m <mtol>...] [options] [--] <file>

 options:
-    -n, --ncols <ncols>       Number of columns. [default: 0]
+    -t, --tarea <tarea>       Specific table areas to analyze.
     -c, --columns <columns>   Comma-separated list of column x-coordinates.
                               Example: -c 10.1,20.2,30.3
+    -n, --ncols <ncols>       Number of columns. [default: -1]
     -y, --ytol <ytol>         Tolerance to account for when grouping rows
                               together. [default: 2]
     -m, --mtol <mtol>         Tolerance to account for when merging columns
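Similarly for stream, a hypothetical invocation marking one table area that holds a four-column table:

    camelot stream -t "0,396,612,0" -n 4 file.pdf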
@@ -166,7 +170,7 @@ def plot_rc_piechart(data, output):
     plt.savefig(''.join([output, '_rc.png']), dpi=300)


-def summary(data, p_time):
+def print_stats(data, p_time):
     from operator import itemgetter
     from itertools import groupby
@@ -331,17 +335,18 @@ if __name__ == '__main__':
         else:
             p.append({'start': int(r), 'end': int(r)})

-    margin_tuple = (float(args['--cmargin']), float(args['--lmargin']),
+    margins = (float(args['--cmargin']), float(args['--lmargin']),
               float(args['--wmargin']))

     if args['<method>'] == 'lattice':
         try:
             manager = Pdf(Lattice(
+                table_area=args['--tarea'],
                 fill=args['--fill'],
+                jtol=[int(j) for j in args['--jtol']],
+                mtol=[int(m) for m in args['--mtol']],
                 scale=int(args['--scale']),
                 invert=args['--invert'],
-                jtol=int(args['--jtol']),
-                mtol=int(args['--mtol']),
-                pdf_margin=margin_tuple,
+                margins=margins,
                 debug=args['--debug']),
                 filename,
                 pagenos=p,
@@ -374,10 +379,10 @@ if __name__ == '__main__':
             if 'rc' in plot_type:
                 plot_rc_piechart(data, pngname)

-        if args['--summary']:
-            summary(data, processing_time)
+        if args['--print-stats']:
+            print_stats(data, processing_time)

-        if args['--save-info']:
+        if args['--save-stats']:
             if args['--output']:
                 scorename = os.path.join(args['--output'], os.path.basename(scorename))
             with open(scorename, 'w') as score_file:
@@ -402,11 +407,12 @@ if __name__ == '__main__':
     elif args['<method>'] == 'stream':
         try:
             manager = Pdf(Stream(
-                ncolumns=int(args['--ncols']),
+                table_area=args['--tarea'],
                 columns=args['--columns'],
-                ytol=int(args['--ytol']),
-                mtol=int(args['--mtol']),
-                pdf_margin=margin_tuple,
+                ncolumns=[int(nc) for nc in args['--ncols']],
+                ytol=[int(y) for y in args['--ytol']],
+                mtol=[int(m) for m in args['--mtol']],
+                margins=margins,
                 debug=args['--debug']),
                 filename,
                 pagenos=p,
@@ -439,10 +445,10 @@ if __name__ == '__main__':
             if 'rc' in plot_type:
                 plot_rc_piechart(data, pngname)

-        if args['--summary']:
-            summary(data, processing_time)
+        if args['--print-stats']:
+            print_stats(data, processing_time)

-        if args['--save-info']:
+        if args['--save-stats']:
             if args['--output']:
                 scorename = os.path.join(args['--output'], os.path.basename(scorename))
             with open(scorename, 'w') as score_file: