parent
0bb6ce0bf9
commit
d86630e70b
|
|
@ -0,0 +1,98 @@
|
|||
import cv2
|
||||
import numpy as np
|
||||
|
||||
|
||||
def adaptive_threshold(imagename, invert=False):
    """Read an image and binarize it with a Gaussian adaptive threshold.

    Parameters
    ----------
    imagename : Path to image.

    invert : bool
        If False, the grayscale image is inverted with ``np.invert``
        before thresholding; if True it is thresholded as-is.
        (optional, default: False)

    Returns
    -------
    img : ndarray
        Image as returned by ``cv2.imread``.

    threshold : ndarray
        Output of ``cv2.adaptiveThreshold`` (maxValue=255,
        blockSize=15, C=-0.2).

    Raises
    ------
    IOError
        If ``cv2.imread`` could not load ``imagename``.
    """
    img = cv2.imread(imagename)
    if img is None:
        # cv2.imread reports an unreadable/missing file by returning None;
        # fail here with a clear message instead of inside cvtColor.
        raise IOError("Could not read image: {0}".format(imagename))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Only the input differs between the two modes; every threshold
    # parameter is shared.
    source = gray if invert else np.invert(gray)
    threshold = cv2.adaptiveThreshold(
        source, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,
        15, -0.2)
    return img, threshold
|
||||
|
||||
|
||||
def find_lines(threshold, direction=None, scale=15):
    """Find vertical or horizontal line segments in a thresholded image.

    The image is eroded and then dilated with a one-pixel-wide
    rectangular structuring element whose length is the image height
    (vertical) or width (horizontal) divided by ``scale``. Each external
    contour of the result is reduced to the centre line of its bounding
    rectangle.

    Parameters
    ----------
    threshold : ndarray
        Thresholded image, e.g. the output of ``adaptive_threshold``.

    direction : str
        Either 'vertical' or 'horizontal'. (required)

    scale : int
        Scaling factor. Large scaling factor leads to smaller lines
        being detected. (optional, default: 15)

    Returns
    -------
    dmask : ndarray
        The eroded and dilated image.

    lines : list
        Tuples ``(x1, y1, x2, y2)`` in image coordinates. Vertical lines
        are ``(x_mid, y_bottom, x_mid, y_top)``; horizontal lines are
        ``(x_left, y_mid, x_right, y_mid)``.

    Raises
    ------
    ValueError
        If ``direction`` is neither 'vertical' nor 'horizontal'.
    """
    # Validate before touching the image so that any unsupported value
    # (not only None) gives the documented error instead of an unbound
    # structuring element further down.
    if direction not in ('vertical', 'horizontal'):
        raise ValueError("Specify direction as either 'vertical' or"
                         " 'horizontal'")

    if direction == 'vertical':
        size = threshold.shape[0] // scale
        el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
    else:
        size = threshold.shape[1] // scale
        el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))

    # The third positional argument of erode/dilate is ``dst``; pass the
    # anchor by keyword so it is not taken for an output array.
    threshold = cv2.erode(threshold, el, anchor=(-1, -1))
    threshold = cv2.dilate(threshold, el, anchor=(-1, -1))

    dmask = threshold
    # Older OpenCV releases modify the image passed to findContours, so
    # search a copy and keep the returned mask intact. Indexing with [-2]
    # picks the contour list from both the 2-value and the 3-value return
    # signatures, with a single call.
    contours = cv2.findContours(
        threshold.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]

    lines = []
    for c in contours:
        x, y, w, h = cv2.boundingRect(c)
        x1, x2 = x, x + w
        y1, y2 = y, y + h
        if direction == 'vertical':
            lines.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1))
        else:
            lines.append((x1, (y1 + y2) / 2, x2, (y1 + y2) / 2))

    return dmask, lines
|
||||
|
||||
|
||||
def find_table_contours(vertical, horizontal):
    """Find bounding rectangles of the largest contours in the line masks.

    Parameters
    ----------
    vertical : ndarray
        Mask of vertical lines, e.g. from ``find_lines``.

    horizontal : ndarray
        Mask of horizontal lines, e.g. from ``find_lines``.

    Returns
    -------
    cont : list
        Up to ten ``(x, y, w, h)`` bounding rectangles, ordered by
        decreasing contour area.
    """
    # NOTE(review): presumably both masks are uint8 with values 0/255, in
    # which case overlapping pixels wrap to 254 and stay non-zero --
    # confirm against the caller.
    mask = vertical + horizontal

    # [-2] selects the contour list from both the 2-value and the 3-value
    # return signatures of findContours, so the image is searched once.
    contours = cv2.findContours(
        mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]

    cont = []
    for c in contours:
        c_poly = cv2.approxPolyDP(c, 3, True)
        x, y, w, h = cv2.boundingRect(c_poly)
        cont.append((x, y, w, h))
    return cont
|
||||
|
||||
|
||||
def find_table_joints(contours, vertical, horizontal):
    """Collect line intersections (joints) inside each table rectangle.

    Parameters
    ----------
    contours : list
        ``(x, y, w, h)`` rectangles in image coordinates.

    vertical : ndarray
        Mask of vertical lines.

    horizontal : ndarray
        Mask of horizontal lines.

    Returns
    -------
    tables : dict
        Maps ``(x, y + h, x + w, y)`` to the list of joint centre
        coordinates found inside that rectangle. Rectangles with four
        or fewer joints are left out.
    """
    joints = np.bitwise_and(vertical, horizontal)
    tables = {}
    for c in contours:
        x, y, w, h = c
        roi = joints[y : y + h, x : x + w]
        # Search a copy: older OpenCV releases modify the image given to
        # findContours, and ``roi`` is a view into the shared ``joints``
        # array. [-2] selects the contour list for both return
        # signatures with a single call.
        jc = cv2.findContours(
            roi.copy(), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)[-2]
        if len(jc) <= 4:  # skip rectangles with four or fewer joints
            continue
        joint_coords = []
        for j in jc:
            jx, jy, jw, jh = cv2.boundingRect(j)
            # centre of the joint's bounding box, shifted from ROI
            # coordinates back to image coordinates
            c1, c2 = x + (2 * jx + jw) / 2, y + (2 * jy + jh) / 2
            joint_coords.append((c1, c2))
        tables[(x, y + h, x + w, y)] = joint_coords

    return tables
|
||||
|
|
@ -4,15 +4,15 @@ import types
|
|||
import copy_reg
|
||||
import logging
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
from wand.image import Image
|
||||
|
||||
from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
|
||||
find_table_joints)
|
||||
from .table import Table
|
||||
from .utils import (transform, segments_bbox, text_bbox, detect_vertical, merge_close_values,
|
||||
get_row_index, get_column_index, get_score, reduce_index,
|
||||
outline, fill_spanning, count_empty, encode_list, pdf_to_text)
|
||||
from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_bbox,
|
||||
detect_vertical, merge_close_values, get_row_index,
|
||||
get_column_index, get_score, reduce_index, outline,
|
||||
fill_spanning, count_empty, encode_list, pdf_to_text)
|
||||
|
||||
|
||||
__all__ = ['Lattice']
|
||||
|
|
@ -26,128 +26,6 @@ def _reduce_method(m):
|
|||
copy_reg.pickle(types.MethodType, _reduce_method)
|
||||
|
||||
|
||||
def _morph_transform(imagename, scale=15, invert=False):
    """Morphological Transformation

    Applies a series of morphological operations on the image
    to find table contours and line segments.
    http://answers.opencv.org/question/63847/how-to-extract-tables-from-an-image/

    adaptiveThreshold is called with blockSize=15 and C=-0.2.
    NOTE(review): this docstring previously cited blockSize=5 as the
    empirical result taken from
    http://pequan.lip6.fr/~bereziat/pima/2012/seuillage/sezgin04.pdf
    -- confirm which value is intended.

    Parameters
    ----------
    imagename : Path to image.

    scale : int
        Scaling factor. Large scaling factor leads to smaller lines
        being detected. (optional, default: 15)

    invert : bool
        Invert pdf image to make sure that lines are in foreground.
        (optional, default: False)

    Returns
    -------
    img : ndarray

    tables : dict
        Dictionary with table bounding box as key and list of
        joints found in the table as value.

    v_segments : list
        List of vertical line segments found in the image.

    h_segments : list
        List of horizontal line segments found in the image.

    Raises
    ------
    IOError
        If ``cv2.imread`` could not load ``imagename``.
    """
    def _contours(image, mode):
        # Search a copy because older OpenCV releases modify the image
        # given to findContours. [-2] selects the contour list from both
        # the 2-value and the 3-value return signatures, so the image is
        # searched exactly once.
        return cv2.findContours(
            image.copy(), mode, cv2.CHAIN_APPROX_SIMPLE)[-2]

    img = cv2.imread(imagename)
    if img is None:
        # cv2.imread reports an unreadable/missing file by returning None
        raise IOError("Could not read image: {0}".format(imagename))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    source = gray if invert else np.invert(gray)
    threshold = cv2.adaptiveThreshold(
        source, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,
        15, -0.2)

    verticalsize = threshold.shape[0] // scale
    horizontalsize = threshold.shape[1] // scale

    ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize))
    hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1))

    # The third positional argument of erode/dilate is ``dst``; pass the
    # anchor by keyword so it is not taken for an output array.
    vertical = cv2.erode(threshold, ver, anchor=(-1, -1))
    vertical = cv2.dilate(vertical, ver, anchor=(-1, -1))

    horizontal = cv2.erode(threshold, hor, anchor=(-1, -1))
    horizontal = cv2.dilate(horizontal, hor, anchor=(-1, -1))

    mask = vertical + horizontal
    joints = np.bitwise_and(vertical, horizontal)
    contours = sorted(_contours(mask, cv2.RETR_EXTERNAL),
                      key=cv2.contourArea, reverse=True)[:10]

    tables = {}
    for c in contours:
        c_poly = cv2.approxPolyDP(c, 3, True)
        x, y, w, h = cv2.boundingRect(c_poly)
        roi = joints[y : y + h, x : x + w]
        jc = _contours(roi, cv2.RETR_CCOMP)
        if len(jc) <= 4:  # skip contours with four or fewer joints
            continue
        joint_coords = []
        for j in jc:
            jx, jy, jw, jh = cv2.boundingRect(j)
            # centre of the joint's bounding box in image coordinates
            c1, c2 = x + (2 * jx + jw) / 2, y + (2 * jy + jh) / 2
            joint_coords.append((c1, c2))
        tables[(x, y + h, x + w, y)] = joint_coords

    v_segments, h_segments = [], []
    for vc in _contours(vertical, cv2.RETR_EXTERNAL):
        x, y, w, h = cv2.boundingRect(vc)
        x1, x2 = x, x + w
        y1, y2 = y, y + h
        v_segments.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1))

    for hc in _contours(horizontal, cv2.RETR_EXTERNAL):
        x, y, w, h = cv2.boundingRect(hc)
        x1, x2 = x, x + w
        y1, y2 = y, y + h
        h_segments.append((x1, (y1 + y2) / 2, x2, (y1 + y2) / 2))

    return img, tables, v_segments, h_segments
|
||||
|
||||
|
||||
class Lattice:
|
||||
"""Lattice algorithm
|
||||
|
||||
|
|
@ -188,17 +66,17 @@ class Lattice:
|
|||
Dictionary with page number as key and list of tables on that
|
||||
page as value.
|
||||
"""
|
||||
|
||||
def __init__(self, fill=None, scale=15, jtol=2, mtol=2,
|
||||
invert=False, pdf_margin=(2.0, 0.5, 0.1), debug=None):
|
||||
def __init__(self, table_area=None, fill=None, jtol=[2], mtol=[2], scale=15,
|
||||
invert=False, margins=(2.0, 0.5, 0.1), debug=None):
|
||||
|
||||
self.method = 'lattice'
|
||||
self.table_area = table_area
|
||||
self.fill = fill
|
||||
self.scale = scale
|
||||
self.jtol = jtol
|
||||
self.mtol = mtol
|
||||
self.scale = scale
|
||||
self.invert = invert
|
||||
self.char_margin, self.line_margin, self.word_margin = pdf_margin
|
||||
self.char_margin, self.line_margin, self.word_margin = margins
|
||||
self.debug = debug
|
||||
|
||||
def get_tables(self, pdfname):
|
||||
|
|
@ -217,48 +95,79 @@ class Lattice:
|
|||
logging.warning("{0}: PDF has no text. It may be an image.".format(
|
||||
os.path.basename(bname)))
|
||||
return None
|
||||
|
||||
imagename = ''.join([bname, '.png'])
|
||||
with Image(filename=pdfname, depth=8, resolution=300) as png:
|
||||
png.save(filename=imagename)
|
||||
|
||||
img, threshold = adaptive_threshold(imagename, invert=self.invert)
|
||||
pdf_x = width
|
||||
pdf_y = height
|
||||
img, table_bbox, v_segments, h_segments = _morph_transform(
|
||||
imagename, scale=self.scale, invert=self.invert)
|
||||
img_x = img.shape[1]
|
||||
img_y = img.shape[0]
|
||||
scaling_factor_x = pdf_x / float(img_x)
|
||||
scaling_factor_y = pdf_y / float(img_y)
|
||||
sc_x_image = img_x / float(pdf_x)
|
||||
sc_y_image = img_y / float(pdf_y)
|
||||
sc_x_pdf = pdf_x / float(img_x)
|
||||
sc_y_pdf = pdf_y / float(img_y)
|
||||
factors_image = (sc_x_image, sc_y_image, pdf_y)
|
||||
factors_pdf = (sc_x_pdf, sc_y_pdf, img_y)
|
||||
|
||||
vmask, v_segments = find_lines(threshold, direction='vertical',
|
||||
scale=self.scale)
|
||||
hmask, h_segments = find_lines(threshold, direction='horizontal',
|
||||
scale=self.scale)
|
||||
|
||||
if self.table_area:
|
||||
if self.fill:
|
||||
if len(self.table_area) != len(self.fill):
|
||||
raise ValueError("message")
|
||||
if len(self.jtol) == 1 and self.jtol[0] == 2:
|
||||
self.jtol = self.jtol * len(self.table_area)
|
||||
if len(self.mtol) == 1 and self.mtol[0] == 2:
|
||||
self.mtol = self.mtol * len(self.table_area)
|
||||
areas = []
|
||||
for area in self.table_area:
|
||||
x1, y1, x2, y2 = area.split(",")
|
||||
x1 = int(x1)
|
||||
y1 = int(y1)
|
||||
x2 = int(x2)
|
||||
y2 = int(y2)
|
||||
x1, y1, x2, y2 = scale_to_image((x1, y1, x2, y2), factors_image)
|
||||
areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
|
||||
table_bbox = find_table_joints(areas, vmask, hmask)
|
||||
else:
|
||||
contours = find_table_contours(vmask, hmask)
|
||||
table_bbox = find_table_joints(contours, vmask, hmask)
|
||||
|
||||
if self.debug:
|
||||
self.debug_images = (img, table_bbox)
|
||||
|
||||
factors = (scaling_factor_x, scaling_factor_y, img_y)
|
||||
table_bbox, v_segments, h_segments = transform(table_bbox, v_segments,
|
||||
h_segments, factors)
|
||||
table_bbox, v_segments, h_segments = scale_to_pdf(table_bbox, v_segments,
|
||||
h_segments, factors_pdf)
|
||||
|
||||
if self.debug:
|
||||
self.debug_segments = (v_segments, h_segments)
|
||||
self.debug_tables = []
|
||||
|
||||
pdf_page = {}
|
||||
page_tables = {}
|
||||
table_no = 1
|
||||
page = {}
|
||||
tables = {}
|
||||
table_no = 0
|
||||
# sort tables based on y-coord
|
||||
for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
|
||||
# select edges which lie within table_bbox
|
||||
table_info = {}
|
||||
# select elements which lie within table_bbox
|
||||
table_data = {}
|
||||
v_s, h_s = segments_bbox(k, v_segments, h_segments)
|
||||
t_bbox = text_bbox(k, text)
|
||||
table_info['text_p'] = 100 * (1 - (len(t_bbox) / len(text)))
|
||||
table_data['text_p'] = 100 * (1 - (len(t_bbox) / len(text)))
|
||||
table_rotation = detect_vertical(t_bbox)
|
||||
cols, rows = zip(*table_bbox[k])
|
||||
cols, rows = list(cols), list(rows)
|
||||
cols.extend([k[0], k[2]])
|
||||
rows.extend([k[1], k[3]])
|
||||
# sort horizontal and vertical segments
|
||||
cols = merge_close_values(sorted(cols), mtol=self.mtol)
|
||||
cols = merge_close_values(sorted(cols), mtol=self.mtol[table_no])
|
||||
rows = merge_close_values(
|
||||
sorted(rows, reverse=True), mtol=self.mtol)
|
||||
sorted(rows, reverse=True), mtol=self.mtol[table_no])
|
||||
# make grid using x and y coord of shortlisted rows and cols
|
||||
cols = [(cols[i], cols[i + 1])
|
||||
for i in range(0, len(cols) - 1)]
|
||||
|
|
@ -266,9 +175,9 @@ class Lattice:
|
|||
for i in range(0, len(rows) - 1)]
|
||||
table = Table(cols, rows)
|
||||
# set table edges to True using ver+hor lines
|
||||
table = table.set_edges(v_s, h_s, jtol=self.jtol)
|
||||
table = table.set_edges(v_s, h_s, jtol=self.jtol[table_no])
|
||||
nouse = table.nocont_ / (len(v_s) + len(h_s))
|
||||
table_info['line_p'] = 100 * (1 - nouse)
|
||||
table_data['line_p'] = 100 * (1 - nouse)
|
||||
# set spanning cells to True
|
||||
table = table.set_spanning()
|
||||
# set table border edges to True
|
||||
|
|
@ -314,10 +223,10 @@ class Lattice:
|
|||
for t in t_bbox]))
|
||||
|
||||
score = get_score([[50, rerror], [50, cerror]])
|
||||
table_info['score'] = score
|
||||
table_data['score'] = score
|
||||
|
||||
if self.fill is not None:
|
||||
table = fill_spanning(table, fill=self.fill)
|
||||
if self.fill:
|
||||
table = fill_spanning(table, fill=self.fill[table_no])
|
||||
ar = table.get_list()
|
||||
if table_rotation == 'left':
|
||||
ar = zip(*ar[::-1])
|
||||
|
|
@ -325,18 +234,18 @@ class Lattice:
|
|||
ar = zip(*ar[::1])
|
||||
ar.reverse()
|
||||
ar = encode_list(ar)
|
||||
table_info['data'] = ar
|
||||
table_data['data'] = ar
|
||||
empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
|
||||
table_info['empty_p'] = empty_p
|
||||
table_info['r_nempty_cells'] = r_nempty_cells
|
||||
table_info['c_nempty_cells'] = c_nempty_cells
|
||||
table_info['nrows'] = len(ar)
|
||||
table_info['ncols'] = len(ar[0])
|
||||
page_tables['table_{0}'.format(table_no)] = table_info
|
||||
table_data['empty_p'] = empty_p
|
||||
table_data['r_nempty_cells'] = r_nempty_cells
|
||||
table_data['c_nempty_cells'] = c_nempty_cells
|
||||
table_data['nrows'] = len(ar)
|
||||
table_data['ncols'] = len(ar[0])
|
||||
tables['table-{0}'.format(table_no + 1)] = table_data
|
||||
table_no += 1
|
||||
pdf_page[os.path.basename(bname)] = page_tables
|
||||
page[os.path.basename(bname)] = tables
|
||||
|
||||
if self.debug:
|
||||
return None
|
||||
|
||||
return pdf_page
|
||||
return page
|
||||
|
|
@ -7,7 +7,8 @@ import logging
|
|||
import numpy as np
|
||||
|
||||
from .table import Table
|
||||
from .utils import get_row_index, get_score, count_empty, encode_list, pdf_to_text
|
||||
from .utils import (get_row_index, get_score, count_empty, encode_list,
|
||||
pdf_to_text, text_bbox)
|
||||
|
||||
|
||||
__all__ = ['Stream']
|
||||
|
|
@ -133,6 +134,17 @@ def _get_column_index(t, columns):
|
|||
return c_idx, error
|
||||
|
||||
|
||||
def _join_rows(rows_grouped, text_y_max, text_y_min):
|
||||
row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r)
|
||||
if len(r) > 0 else 0 for r in rows_grouped]
|
||||
rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
|
||||
rows.insert(0, text_y_max)
|
||||
rows.append(text_y_min)
|
||||
rows = [(rows[i], rows[i + 1])
|
||||
for i in range(0, len(rows) - 1)]
|
||||
return rows
|
||||
|
||||
|
||||
def _add_columns(cols, text, ytolerance):
|
||||
if text:
|
||||
text = _group_rows(text, ytol=ytolerance)
|
||||
|
|
@ -143,14 +155,6 @@ def _add_columns(cols, text, ytolerance):
|
|||
return cols
|
||||
|
||||
|
||||
def _get_table_bounds(rows):
|
||||
x0 = min([t.x0 for r in rows for t in r])
|
||||
x1 = max([t.x1 for r in rows for t in r])
|
||||
y0 = min([t.y0 for t in rows[-1]])
|
||||
y1 = max([t.y1 for t in rows[0]])
|
||||
return x0, x1, y0, y1
|
||||
|
||||
|
||||
def _join_columns(cols, text_x_min, text_x_max):
|
||||
cols = sorted(cols)
|
||||
cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
|
||||
|
|
@ -194,16 +198,16 @@ class Stream:
|
|||
Dictionary with page number as key and list of tables on that
|
||||
page as value.
|
||||
"""
|
||||
|
||||
def __init__(self, ncolumns=0, columns=None, ytol=2, mtol=2,
|
||||
pdf_margin=(2.0, 0.5, 0.1), debug=False):
|
||||
def __init__(self, table_area=None, columns=None, ncolumns=None, ytol=[2],
|
||||
mtol=[2], margins=(2.0, 0.5, 0.1), debug=False):
|
||||
|
||||
self.method = 'stream'
|
||||
self.ncolumns = ncolumns
|
||||
self.table_area = table_area
|
||||
self.columns = columns
|
||||
self.ncolumns = ncolumns
|
||||
self.ytol = ytol
|
||||
self.mtol = mtol
|
||||
self.char_margin, self.line_margin, self.word_margin = pdf_margin
|
||||
self.char_margin, self.line_margin, self.word_margin = margins
|
||||
self.debug = debug
|
||||
|
||||
def get_tables(self, pdfname):
|
||||
|
|
@ -222,106 +226,126 @@ class Stream:
|
|||
logging.warning("{0}: PDF has no text. It may be an image.".format(
|
||||
os.path.basename(bname)))
|
||||
return None
|
||||
text.sort(key=lambda x: (-x.y0, x.x0))
|
||||
|
||||
if self.debug:
|
||||
self.debug_text = [(t.x0, t.y0, t.x1, t.y1) for t in text]
|
||||
return None
|
||||
|
||||
rows_grouped = _group_rows(text, ytol=self.ytol)
|
||||
elements = [len(r) for r in rows_grouped]
|
||||
row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r)
|
||||
if len(r) > 0 else 0 for r in rows_grouped]
|
||||
rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
|
||||
bounds = _get_table_bounds(rows_grouped)
|
||||
rows.insert(0, bounds[3])
|
||||
rows.append(bounds[2])
|
||||
rows = [(rows[i], rows[i + 1])
|
||||
for i in range(0, len(rows) - 1)]
|
||||
|
||||
guess = False
|
||||
if self.columns:
|
||||
# user has to input boundary columns too
|
||||
# take (0, width) by default
|
||||
# similar to else condition
|
||||
# len can't be 1
|
||||
cols = self.columns.split(',')
|
||||
cols = [(float(cols[i]), float(cols[i + 1]))
|
||||
for i in range(0, len(cols) - 1)]
|
||||
else:
|
||||
if self.table_area:
|
||||
if self.columns:
|
||||
if len(self.table_area) != len(self.columns):
|
||||
raise ValueError("message")
|
||||
if self.ncolumns:
|
||||
ncols = self.ncolumns
|
||||
cols = [(t.x0, t.x1)
|
||||
for r in rows_grouped if len(r) == ncols for t in r]
|
||||
cols = _merge_columns(sorted(cols), mtol=self.mtol)
|
||||
if len(cols) != self.ncolumns:
|
||||
logging.warning("{}: The number of columns after merge"
|
||||
" isn't the same as what you specified."
|
||||
" Change the value of mtol.".format(
|
||||
os.path.basename(bname)))
|
||||
cols = _join_columns(cols, bounds[0], bounds[1])
|
||||
else:
|
||||
guess = True
|
||||
ncols = max(set(elements), key=elements.count)
|
||||
len_non_mode = len(filter(lambda x: x != ncols, elements))
|
||||
if ncols == 1 and not self.debug:
|
||||
# no tables detected
|
||||
logging.warning("{}: Only one column was detected, the PDF"
|
||||
" may have no tables. Specify ncols if"
|
||||
" the PDF has tables.".format(
|
||||
os.path.basename(bname)))
|
||||
cols = [(t.x0, t.x1)
|
||||
for r in rows_grouped if len(r) == ncols for t in r]
|
||||
cols = _merge_columns(sorted(cols), mtol=self.mtol)
|
||||
inner_text = []
|
||||
for i in range(1, len(cols)):
|
||||
left = cols[i - 1][1]
|
||||
right = cols[i][0]
|
||||
inner_text.extend([t for t in text if t.x0 > left and t.x1 < right])
|
||||
outer_text = [t for t in text if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
|
||||
inner_text.extend(outer_text)
|
||||
cols = _add_columns(cols, inner_text, self.ytol)
|
||||
cols = _join_columns(cols, bounds[0], bounds[1])
|
||||
|
||||
pdf_page = {}
|
||||
page_tables = {}
|
||||
table_info = {}
|
||||
table = Table(cols, rows)
|
||||
rerror = []
|
||||
cerror = []
|
||||
for row in rows_grouped:
|
||||
for t in row:
|
||||
try:
|
||||
r_idx, rass_error = get_row_index(t, rows)
|
||||
except ValueError as e:
|
||||
# couldn't assign LTTextLH to any cell
|
||||
vprint(e.message)
|
||||
continue
|
||||
try:
|
||||
c_idx, cass_error = _get_column_index(t, cols)
|
||||
except ValueError as e:
|
||||
# couldn't assign LTTextLH to any cell
|
||||
vprint(e.message)
|
||||
continue
|
||||
rerror.append(rass_error)
|
||||
cerror.append(cass_error)
|
||||
table.cells[r_idx][c_idx].add_text(
|
||||
t.get_text().strip('\n'))
|
||||
if guess:
|
||||
score = get_score([[33, rerror], [33, cerror], [34, [len_non_mode / len(elements)]]])
|
||||
if len(self.table_area) != len(self.ncolumns):
|
||||
raise ValueError("message")
|
||||
if len(self.ytol) == 1 and self.ytol[0] == 2:
|
||||
self.ytol = self.ytol * len(self.table_area)
|
||||
if len(self.mtol) == 1 and self.mtol[0] == 2:
|
||||
self.mtol = self.mtol * len(self.table_area)
|
||||
table_bbox = {}
|
||||
for area in self.table_area:
|
||||
x1, y1, x2, y2 = area.split(",")
|
||||
x1 = int(x1)
|
||||
y1 = int(y1)
|
||||
x2 = int(x2)
|
||||
y2 = int(y2)
|
||||
table_bbox[(x1, y2, x2, y1)] = None
|
||||
else:
|
||||
score = get_score([[50, rerror], [50, cerror]])
|
||||
table_info['score'] = score
|
||||
ar = table.get_list()
|
||||
ar = encode_list(ar)
|
||||
table_info['data'] = ar
|
||||
empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
|
||||
table_info['empty_p'] = empty_p
|
||||
table_info['r_nempty_cells'] = r_nempty_cells
|
||||
table_info['c_nempty_cells'] = c_nempty_cells
|
||||
table_info['nrows'] = len(ar)
|
||||
table_info['ncols'] = len(ar[0])
|
||||
page_tables['table_1'] = table_info
|
||||
pdf_page[os.path.basename(bname)] = page_tables
|
||||
table_bbox = {(0, height, width, 0): None}
|
||||
|
||||
return pdf_page
|
||||
page = {}
|
||||
tables = {}
|
||||
table_no = 0
|
||||
# sort tables based on y-coord
|
||||
for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
|
||||
# select elements which lie within table_bbox
|
||||
table_data = {}
|
||||
t_bbox = text_bbox(k, text)
|
||||
t_bbox.sort(key=lambda x: (-x.y0, x.x0))
|
||||
|
||||
rows_grouped = _group_rows(t_bbox, ytol=self.ytol[table_no])
|
||||
rows = _join_rows(rows_grouped, k[3], k[1])
|
||||
elements = [len(r) for r in rows_grouped]
|
||||
|
||||
guess = False
|
||||
if self.columns and self.columns[table_no] != "":
|
||||
# user has to input boundary columns too
|
||||
# take (0, width) by default
|
||||
# similar to else condition
|
||||
# len can't be 1
|
||||
cols = self.columns[table_no].split(',')
|
||||
cols = [(float(cols[i]), float(cols[i + 1]))
|
||||
for i in range(0, len(cols) - 1)]
|
||||
else:
|
||||
if self.ncolumns and self.ncolumns[table_no] != -1:
|
||||
ncols = self.ncolumns[table_no]
|
||||
cols = [(t.x0, t.x1)
|
||||
for r in rows_grouped if len(r) == ncols for t in r]
|
||||
cols = _merge_columns(sorted(cols), mtol=self.mtol[table_no])
|
||||
if len(cols) != self.ncolumns[table_no]:
|
||||
logging.warning("{}: The number of columns after merge"
|
||||
" isn't the same as what you specified."
|
||||
" Change the value of mtol.".format(
|
||||
os.path.basename(bname)))
|
||||
cols = _join_columns(cols, k[0], k[2])
|
||||
else:
|
||||
guess = True
|
||||
ncols = max(set(elements), key=elements.count)
|
||||
len_non_mode = len(filter(lambda x: x != ncols, elements))
|
||||
if ncols == 1 and not self.debug:
|
||||
# no tables detected
|
||||
logging.warning("{}: Only one column was detected, the PDF"
|
||||
" may have no tables. Specify ncols if"
|
||||
" the PDF has tables.".format(
|
||||
os.path.basename(bname)))
|
||||
cols = [(t.x0, t.x1)
|
||||
for r in rows_grouped if len(r) == ncols for t in r]
|
||||
cols = _merge_columns(sorted(cols), mtol=self.mtol[table_no])
|
||||
inner_text = []
|
||||
for i in range(1, len(cols)):
|
||||
left = cols[i - 1][1]
|
||||
right = cols[i][0]
|
||||
inner_text.extend([t for t in text if t.x0 > left and t.x1 < right])
|
||||
outer_text = [t for t in text if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
|
||||
inner_text.extend(outer_text)
|
||||
cols = _add_columns(cols, inner_text, self.ytol[table_no])
|
||||
cols = _join_columns(cols, k[0], k[2])
|
||||
|
||||
table = Table(cols, rows)
|
||||
rerror = []
|
||||
cerror = []
|
||||
for row in rows_grouped:
|
||||
for t in row:
|
||||
try:
|
||||
r_idx, rass_error = get_row_index(t, rows)
|
||||
except ValueError as e:
|
||||
# couldn't assign LTTextLH to any cell
|
||||
continue
|
||||
try:
|
||||
c_idx, cass_error = _get_column_index(t, cols)
|
||||
except ValueError as e:
|
||||
# couldn't assign LTTextLH to any cell
|
||||
continue
|
||||
rerror.append(rass_error)
|
||||
cerror.append(cass_error)
|
||||
table.cells[r_idx][c_idx].add_text(
|
||||
t.get_text().strip('\n'))
|
||||
if guess:
|
||||
score = get_score([[33, rerror], [33, cerror], [34, [len_non_mode / len(elements)]]])
|
||||
else:
|
||||
score = get_score([[50, rerror], [50, cerror]])
|
||||
|
||||
table_data['score'] = score
|
||||
ar = encode_list(table.get_list())
|
||||
table_data['data'] = ar
|
||||
empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
|
||||
table_data['empty_p'] = empty_p
|
||||
table_data['r_nempty_cells'] = r_nempty_cells
|
||||
table_data['c_nempty_cells'] = c_nempty_cells
|
||||
table_data['nrows'] = len(ar)
|
||||
table_data['ncols'] = len(ar[0])
|
||||
tables['table-{0}'.format(table_no + 1)] = table_data
|
||||
table_no += 1
|
||||
page[os.path.basename(bname)] = tables
|
||||
|
||||
return page
|
||||
|
|
@ -81,7 +81,17 @@ def rotate(x1, y1, x2, y2, angle):
|
|||
return xnew, ynew
|
||||
|
||||
|
||||
def transform(tables, v_segments, h_segments, factors):
|
||||
def scale_to_image(k, factors):
    """Convert a ``(x1, y1, x2, y2)`` box to integer image coordinates.

    Parameters
    ----------
    k : tuple
        Tuple ``(x1, y1, x2, y2)``.

    factors : tuple
        Tuple ``(scaling_factor_x, scaling_factor_y, pdf_y)``.

    Returns
    -------
    tuple
        ``(x1, y1, x2, y2)`` after scaling, truncated with ``int``.
    """
    left, first_y, right, second_y = k
    factor_x, factor_y, pdf_height = factors
    # x values are only scaled; y values first go through
    # abs(translate(-pdf_y, y)).
    # NOTE(review): presumably this flips the vertical axis between the
    # PDF and image coordinate systems -- confirm against translate().
    converted = (
        scale(left, factor_x),
        scale(abs(translate(-pdf_height, first_y)), factor_y),
        scale(right, factor_x),
        scale(abs(translate(-pdf_height, second_y)), factor_y),
    )
    return tuple(int(value) for value in converted)
|
||||
|
||||
|
||||
def scale_to_pdf(tables, v_segments, h_segments, factors):
|
||||
"""Translates and scales OpenCV coordinates to PDFMiner coordinate
|
||||
space.
|
||||
|
||||
|
|
|
|||
|
|
@ -40,9 +40,9 @@ options:
|
|||
-W, --wmargin <wmargin> Word margin. Insert blank spaces between chars
|
||||
if distance between words is greater than word
|
||||
margin. [default: 0.1]
|
||||
-S, --save-info Save parsing info for each page to a file.
|
||||
-S, --print-stats List stats on the parsing process.
|
||||
-T, --save-stats Save stats to a file.
|
||||
-X, --plot <dist> Plot distributions. (page,all,rc)
|
||||
-Z, --summary Summarize metrics.
|
||||
|
||||
camelot methods:
|
||||
lattice Looks for lines between data.
|
||||
|
|
@ -55,19 +55,21 @@ lattice_doc = """
|
|||
Lattice method looks for lines between text to form a table.
|
||||
|
||||
usage:
|
||||
camelot lattice [options] [--] <file>
|
||||
camelot lattice [-t <tarea>...] [-F <fill>...] [-j <jtol>...]
|
||||
[-m <mtol>...] [options] [--] <file>
|
||||
|
||||
options:
|
||||
-t, --tarea <tarea> Specific table areas to analyze.
|
||||
-F, --fill <fill> Fill data in horizontal and/or vertical spanning
|
||||
cells. Example: -F h, -F v, -F hv
|
||||
-s, --scale <scale> Scaling factor. Large scaling factor leads to
|
||||
smaller lines being detected. [default: 15]
|
||||
-i, --invert Invert pdf image to make sure that lines are
|
||||
in foreground.
|
||||
-j, --jtol <jtol> Tolerance to account for when comparing joint
|
||||
and line coordinates. [default: 2]
|
||||
-m, --mtol <mtol> Tolerance to account for when merging lines
|
||||
which are very close. [default: 2]
|
||||
-s, --scale <scale> Scaling factor. Large scaling factor leads to
|
||||
smaller lines being detected. [default: 15]
|
||||
-i, --invert Invert pdf image to make sure that lines are
|
||||
in foreground.
|
||||
-d, --debug <debug> Debug by visualizing pdf geometry.
|
||||
(contour,line,joint,table) Example: -d table
|
||||
"""
|
||||
|
|
@ -76,12 +78,14 @@ stream_doc = """
|
|||
Stream method looks for whitespaces between text to form a table.
|
||||
|
||||
usage:
|
||||
camelot stream [options] [--] <file>
|
||||
camelot stream [-t <tarea>...] [-c <columns>...] [-n <ncols>...] [-y <ytol>...]
|
||||
[-m <mtol>...] [options] [--] <file>
|
||||
|
||||
options:
|
||||
-n, --ncols <ncols> Number of columns. [default: 0]
|
||||
-t, --tarea <tarea> Specific table areas to analyze.
|
||||
-c, --columns <columns> Comma-separated list of column x-coordinates.
|
||||
Example: -c 10.1,20.2,30.3
|
||||
-n, --ncols <ncols> Number of columns. [default: -1]
|
||||
-y, --ytol <ytol> Tolerance to account for when grouping rows
|
||||
together. [default: 2]
|
||||
-m, --mtol <mtol> Tolerance to account for when merging columns
|
||||
|
|
@ -166,7 +170,7 @@ def plot_rc_piechart(data, output):
|
|||
plt.savefig(''.join([output, '_rc.png']), dpi=300)
|
||||
|
||||
|
||||
def summary(data, p_time):
|
||||
def print_stats(data, p_time):
|
||||
from operator import itemgetter
|
||||
from itertools import groupby
|
||||
|
||||
|
|
@ -331,17 +335,18 @@ if __name__ == '__main__':
|
|||
else:
|
||||
p.append({'start': int(r), 'end': int(r)})
|
||||
|
||||
margin_tuple = (float(args['--cmargin']), float(args['--lmargin']),
|
||||
margins = (float(args['--cmargin']), float(args['--lmargin']),
|
||||
float(args['--wmargin']))
|
||||
if args['<method>'] == 'lattice':
|
||||
try:
|
||||
manager = Pdf(Lattice(
|
||||
table_area=args['--tarea'],
|
||||
fill=args['--fill'],
|
||||
jtol=[int(j) for j in args['--jtol']],
|
||||
mtol=[int(m) for m in args['--mtol']],
|
||||
scale=int(args['--scale']),
|
||||
invert=args['--invert'],
|
||||
jtol=int(args['--jtol']),
|
||||
mtol=int(args['--mtol']),
|
||||
pdf_margin=margin_tuple,
|
||||
margins=margins,
|
||||
debug=args['--debug']),
|
||||
filename,
|
||||
pagenos=p,
|
||||
|
|
@ -374,10 +379,10 @@ if __name__ == '__main__':
|
|||
if 'rc' in plot_type:
|
||||
plot_rc_piechart(data, pngname)
|
||||
|
||||
if args['--summary']:
|
||||
summary(data, processing_time)
|
||||
if args['--print-stats']:
|
||||
print_stats(data, processing_time)
|
||||
|
||||
if args['--save-info']:
|
||||
if args['--save-stats']:
|
||||
if args['--output']:
|
||||
scorename = os.path.join(args['--output'], os.path.basename(scorename))
|
||||
with open(scorename, 'w') as score_file:
|
||||
|
|
@ -402,11 +407,12 @@ if __name__ == '__main__':
|
|||
elif args['<method>'] == 'stream':
|
||||
try:
|
||||
manager = Pdf(Stream(
|
||||
ncolumns=int(args['--ncols']),
|
||||
table_area=args['--tarea'],
|
||||
columns=args['--columns'],
|
||||
ytol=int(args['--ytol']),
|
||||
mtol=int(args['--mtol']),
|
||||
pdf_margin=margin_tuple,
|
||||
ncolumns=[int(nc) for nc in args['--ncols']],
|
||||
ytol=[int(y) for y in args['--ytol']],
|
||||
mtol=[int(m) for m in args['--mtol']],
|
||||
margins=margins,
|
||||
debug=args['--debug']),
|
||||
filename,
|
||||
pagenos=p,
|
||||
|
|
@ -439,10 +445,10 @@ if __name__ == '__main__':
|
|||
if 'rc' in plot_type:
|
||||
plot_rc_piechart(data, pngname)
|
||||
|
||||
if args['--summary']:
|
||||
summary(data, processing_time)
|
||||
if args['--print-stats']:
|
||||
print_stats(data, processing_time)
|
||||
|
||||
if args['--save-info']:
|
||||
if args['--save-stats']:
|
||||
if args['--output']:
|
||||
scorename = os.path.join(args['--output'], os.path.basename(scorename))
|
||||
with open(scorename, 'w') as score_file:
|
||||
|
|
|
|||
Loading…
Reference in New Issue