Add various metrics to score the quality of a parse

pull/2/head
Vinayak Mehta 2016-08-30 14:52:49 +05:30 committed by GitHub
parent 43a009dab4
commit 552f9cf422
11 changed files with 1027 additions and 472 deletions

View File

@@ -1,18 +1,31 @@
from __future__ import print_function
from __future__ import division
import os
import types
import copy_reg
import logging
import cv2
import numpy as np
from wand.image import Image
from .table import Table
from .utils import (transform, elements_bbox, detect_vertical, merge_close_values,
get_row_index, get_column_index, reduce_index, outline,
fill_spanning, remove_empty, encode_list)
get_row_index, get_column_index, get_score, reduce_index,
outline, fill_spanning, count_empty, encode_list, pdf_to_text)
__all__ = ['Lattice']
def _reduce_method(m):
if m.im_self is None:
return getattr, (m.im_class, m.im_func.func_name)
else:
return getattr, (m.im_self, m.im_func.func_name)
copy_reg.pickle(types.MethodType, _reduce_method)
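
Aside: the copy_reg shim above is needed because multiprocessing.Pool pickles the callable it is handed, and Python 2's pickle cannot serialize bound methods such as Lattice.get_tables by default. A minimal, self-contained sketch of the effect (Doubler is a hypothetical example class, not part of this commit):

    import types
    import copy_reg
    import pickle

    def _reduce_method(m):
        # same shim as above: reduce a method to (getattr, args)
        if m.im_self is None:
            return getattr, (m.im_class, m.im_func.func_name)
        else:
            return getattr, (m.im_self, m.im_func.func_name)

    copy_reg.pickle(types.MethodType, _reduce_method)

    class Doubler(object):  # hypothetical example class
        def work(self, x):
            return x * 2

    # without the registration, pickle.dumps raises PicklingError on
    # Python 2; with it, the bound method round-trips and can be
    # mapped over a multiprocessing.Pool
    method = pickle.loads(pickle.dumps(Doubler().work))
    print(method(21))  # 42
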
def _morph_transform(imagename, scale=15, invert=False):
"""Morphological Transformation
@@ -65,8 +78,8 @@ def _morph_transform(imagename, scale=15, invert=False):
vertical = threshold
horizontal = threshold
verticalsize = vertical.shape[0] / scale
horizontalsize = horizontal.shape[1] / scale
verticalsize = vertical.shape[0] // scale
horizontalsize = horizontal.shape[1] // scale
ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize))
hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1))
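
The switch to // matters because this module now imports true division from __future__: shape[0] / scale would yield a float, and cv2.getStructuringElement rejects non-integer kernel sizes. For context, a sketch of the erode/dilate pass that rectangular kernels like these typically feed for line detection (that part of _morph_transform is elided by this hunk; the thresholding step and filename are illustrative assumptions, not the verbatim code):

    import cv2
    import numpy as np

    img = cv2.imread('page.png', cv2.IMREAD_GRAYSCALE)  # illustrative input
    # binarize with the ruling lines as white on black
    threshold = cv2.adaptiveThreshold(np.invert(img), 255,
                                      cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                      cv2.THRESH_BINARY, 15, -2)
    vertical, horizontal, scale = threshold.copy(), threshold.copy(), 15
    ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, vertical.shape[0] // scale))
    hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontal.shape[1] // scale, 1))
    # erosion keeps only white runs at least as long as the kernel;
    # dilation restores their extent, isolating vertical/horizontal lines
    vertical = cv2.dilate(cv2.erode(vertical, ver), ver)
    horizontal = cv2.dilate(cv2.erode(horizontal, hor), hor)
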
@@ -79,8 +92,12 @@ def _morph_transform(imagename, scale=15, invert=False):
mask = vertical + horizontal
joints = np.bitwise_and(vertical, horizontal)
__, contours, __ = cv2.findContours(
mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
try:
__, contours, __ = cv2.findContours(
mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
except ValueError:
contours, __ = cv2.findContours(
mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]
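
The try/except above papers over a cv2.findContours signature change: OpenCV 3 returns (image, contours, hierarchy) while OpenCV 2 returned (contours, hierarchy), so unpacking three values raises ValueError on the older API (OpenCV 4 later reverted to two values, which the same fallback handles). The commit inlines this at each call site; a helper like the following (hypothetical, not part of the diff) would centralize it:

    def _find_contours(img, mode, method):
        # try OpenCV 3's three-value signature first, then fall back
        # to the two-value signature of OpenCV 2 (and later OpenCV 4)
        try:
            __, contours, __ = cv2.findContours(img, mode, method)
        except ValueError:
            contours, __ = cv2.findContours(img, mode, method)
        return contours

    # e.g. contours = _find_contours(mask, cv2.RETR_EXTERNAL,
    #                                cv2.CHAIN_APPROX_SIMPLE)
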
tables = {}
@@ -88,8 +105,12 @@ def _morph_transform(imagename, scale=15, invert=False):
c_poly = cv2.approxPolyDP(c, 3, True)
x, y, w, h = cv2.boundingRect(c_poly)
roi = joints[y : y + h, x : x + w]
__, jc, __ = cv2.findContours(
roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
try:
__, jc, __ = cv2.findContours(
roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
except ValueError:
jc, __ = cv2.findContours(
roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
if len(jc) <= 4:  # skip contours with four or fewer joints
continue
joint_coords = []
@@ -100,16 +121,24 @@ def _morph_transform(imagename, scale=15, invert=False):
tables[(x, y + h, x + w, y)] = joint_coords
v_segments, h_segments = [], []
_, vcontours, _ = cv2.findContours(
vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
try:
_, vcontours, _ = cv2.findContours(
vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
except ValueError:
vcontours, _ = cv2.findContours(
vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
for vc in vcontours:
x, y, w, h = cv2.boundingRect(vc)
x1, x2 = x, x + w
y1, y2 = y, y + h
v_segments.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1))
_, hcontours, _ = cv2.findContours(
horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
try:
_, hcontours, _ = cv2.findContours(
horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
except ValueError:
hcontours, _ = cv2.findContours(
horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
for hc in hcontours:
x, y, w, h = cv2.boundingRect(hc)
x1, x2 = x, x + w
@@ -160,24 +189,19 @@ class Lattice:
page as value.
"""
def __init__(self, pdfobject, fill=None, scale=15, jtol=2, mtol=2,
invert=False, debug=None, verbose=False):
def __init__(self, fill=None, scale=15, jtol=2, mtol=2,
invert=False, pdf_margin=(2.0, 0.5, 0.1), debug=None):
self.pdfobject = pdfobject
self.method = 'lattice'
self.fill = fill
self.scale = scale
self.jtol = jtol
self.mtol = mtol
self.invert = invert
self.char_margin, self.line_margin, self.word_margin = pdf_margin
self.debug = debug
self.verbose = verbose
self.tables = {}
if self.debug is not None:
self.debug_images = {}
self.debug_segments = {}
self.debug_tables = {}
def get_tables(self):
def get_tables(self, pdfname):
"""Returns all tables found in given pdf.
Returns
@@ -186,169 +210,124 @@ class Lattice:
Dictionary with page number as key and list of tables on that
page as value.
"""
vprint = print if self.verbose else lambda *a, **k: None
self.pdfobject.split()
self.pdfobject.convert()
for page in self.pdfobject.extract():
p, text, __, width, height = page
pkey = 'pg-{0}'.format(p)
imagename = os.path.join(
self.pdfobject.temp, '{}.png'.format(pkey))
pdf_x = width
pdf_y = height
img, table_bbox, v_segments, h_segments = _morph_transform(
imagename, scale=self.scale, invert=self.invert)
img_x = img.shape[1]
img_y = img.shape[0]
scaling_factor_x = pdf_x / float(img_x)
scaling_factor_y = pdf_y / float(img_y)
text, __, width, height = pdf_to_text(pdfname, self.char_margin,
self.line_margin, self.word_margin)
bname, __ = os.path.splitext(pdfname)
if not text:
logging.warning("{0}: PDF has no text. It may be an image.".format(
os.path.basename(bname)))
return None
imagename = ''.join([bname, '.png'])
with Image(filename=pdfname, depth=8, resolution=300) as png:
png.save(filename=imagename)
pdf_x = width
pdf_y = height
img, table_bbox, v_segments, h_segments = _morph_transform(
imagename, scale=self.scale, invert=self.invert)
img_x = img.shape[1]
img_y = img.shape[0]
scaling_factor_x = pdf_x / float(img_x)
scaling_factor_y = pdf_y / float(img_y)
if self.debug is not None:
self.debug_images[pkey] = (img, table_bbox)
if self.debug:
self.debug_images = (img, table_bbox)
factors = (scaling_factor_x, scaling_factor_y, img_y)
table_bbox, v_segments, h_segments = transform(table_bbox, v_segments,
h_segments, factors)
factors = (scaling_factor_x, scaling_factor_y, img_y)
table_bbox, v_segments, h_segments = transform(table_bbox, v_segments,
h_segments, factors)
if self.debug is not None:
self.debug_segments[pkey] = (v_segments, h_segments)
if self.debug:
self.debug_segments = (v_segments, h_segments)
self.debug_tables = []
if self.debug is not None:
debug_page_tables = []
page_tables = []
# sort tables based on y-coord
for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
# select edges which lie within table_bbox
text_bbox, v_s, h_s = elements_bbox(k, text, v_segments,
h_segments)
rotated = detect_vertical(text_bbox)
cols, rows = zip(*table_bbox[k])
cols, rows = list(cols), list(rows)
cols.extend([k[0], k[2]])
rows.extend([k[1], k[3]])
# sort horizontal and vertical segments
cols = merge_close_values(sorted(cols), mtol=self.mtol)
rows = merge_close_values(
sorted(rows, reverse=True), mtol=self.mtol)
# make grid using x and y coord of shortlisted rows and cols
cols = [(cols[i], cols[i + 1])
for i in range(0, len(cols) - 1)]
rows = [(rows[i], rows[i + 1])
for i in range(0, len(rows) - 1)]
table = Table(cols, rows)
# set table edges to True using ver+hor lines
table = table.set_edges(v_s, h_s, jtol=self.jtol)
# set spanning cells to True
table = table.set_spanning()
# set table border edges to True
table = outline(table)
pdf_page = {}
page_tables = {}
table_no = 1
# sort tables based on y-coord
for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
# select edges which lie within table_bbox
table_info = {}
text_bbox, v_s, h_s = elements_bbox(k, text, v_segments,
h_segments)
table_info['text_p'] = 100 * (1 - (len(text_bbox) / len(text)))
rotated = detect_vertical(text_bbox)
cols, rows = zip(*table_bbox[k])
cols, rows = list(cols), list(rows)
cols.extend([k[0], k[2]])
rows.extend([k[1], k[3]])
# sort horizontal and vertical segments
cols = merge_close_values(sorted(cols), mtol=self.mtol)
rows = merge_close_values(
sorted(rows, reverse=True), mtol=self.mtol)
# make grid using x and y coord of shortlisted rows and cols
cols = [(cols[i], cols[i + 1])
for i in range(0, len(cols) - 1)]
rows = [(rows[i], rows[i + 1])
for i in range(0, len(rows) - 1)]
table = Table(cols, rows)
# set table edges to True using ver+hor lines
table = table.set_edges(v_s, h_s, jtol=self.jtol)
nouse = table.nocont_ / (len(v_s) + len(h_s))
table_info['line_p'] = 100 * (1 - nouse)
# set spanning cells to True
table = table.set_spanning()
# set table border edges to True
table = outline(table)
if self.debug is not None:
debug_page_tables.append(table)
if self.debug:
self.debug_tables.append(table)
# fill text after sorting it
if rotated == '':
text_bbox.sort(key=lambda x: (-x.y0, x.x0))
elif rotated == 'left':
text_bbox.sort(key=lambda x: (x.x0, x.y0))
elif rotated == 'right':
text_bbox.sort(key=lambda x: (-x.x0, -x.y0))
for t in text_bbox:
r_idx = get_row_index(t, rows)
c_idx = get_column_index(t, cols)
if None in [r_idx, c_idx]:
# couldn't assign LTChar to any cell
pass
else:
r_idx, c_idx = reduce_index(
table, rotated, r_idx, c_idx)
table.cells[r_idx][c_idx].add_text(
t.get_text().strip('\n'))
# fill text after sorting it
if rotated == '':
text_bbox.sort(key=lambda x: (-x.y0, x.x0))
elif rotated == 'left':
text_bbox.sort(key=lambda x: (x.x0, x.y0))
elif rotated == 'right':
text_bbox.sort(key=lambda x: (-x.x0, -x.y0))
if self.fill is not None:
table = fill_spanning(table, fill=self.fill)
ar = table.get_list()
if rotated == 'left':
ar = zip(*ar[::-1])
elif rotated == 'right':
ar = zip(*ar[::1])
ar.reverse()
ar = remove_empty(ar)
ar = [list(o) for o in ar]
page_tables.append(encode_list(ar))
vprint(pkey)
self.tables[pkey] = page_tables
rerror = []
cerror = []
for t in text_bbox:
try:
r_idx, rass_error = get_row_index(t, rows)
except TypeError:
# couldn't assign LTChar to any cell
continue
try:
c_idx, cass_error = get_column_index(t, cols)
except TypeError:
# couldn't assign LTChar to any cell
continue
rerror.append(rass_error)
cerror.append(cass_error)
r_idx, c_idx = reduce_index(
table, rotated, r_idx, c_idx)
table.cells[r_idx][c_idx].add_text(
t.get_text().strip('\n'))
score = get_score([[50, rerror], [50, cerror]])
table_info['score'] = score
if self.debug is not None:
self.debug_tables[pkey] = debug_page_tables
if self.fill is not None:
table = fill_spanning(table, fill=self.fill)
ar = table.get_list()
if rotated == 'left':
ar = zip(*ar[::-1])
elif rotated == 'right':
ar = zip(*ar[::1])
ar.reverse()
ar = encode_list(ar)
table_info['data'] = ar
empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
table_info['empty_p'] = empty_p
table_info['r_nempty_cells'] = r_nempty_cells
table_info['c_nempty_cells'] = c_nempty_cells
table_info['nrows'] = len(ar)
table_info['ncols'] = len(ar[0])
page_tables['table_{0}'.format(table_no)] = table_info
table_no += 1
pdf_page[os.path.basename(bname)] = page_tables
if self.pdfobject.clean:
self.pdfobject.remove_tempdir()
if self.debug is not None:
if self.debug:
return None
return self.tables
def plot_geometry(self, geometry):
"""Plots various pdf geometries that are detected so user can choose
tweak scale, jtol, mtol parameters.
"""
import matplotlib.pyplot as plt
if geometry == 'contour':
for pkey in self.debug_images.keys():
img, table_bbox = self.debug_images[pkey]
for t in table_bbox.keys():
cv2.rectangle(img, (t[0], t[1]),
(t[2], t[3]), (255, 0, 0), 3)
plt.imshow(img)
plt.show()
elif geometry == 'joint':
x_coord = []
y_coord = []
for pkey in self.debug_images.keys():
img, table_bbox = self.debug_images[pkey]
for k in table_bbox.keys():
for coord in table_bbox[k]:
x_coord.append(coord[0])
y_coord.append(coord[1])
max_x, max_y = max(x_coord), max(y_coord)
plt.plot(x_coord, y_coord, 'ro')
plt.axis([0, max_x + 100, max_y + 100, 0])
plt.imshow(img)
plt.show()
elif geometry == 'line':
for pkey in self.debug_segments.keys():
v_s, h_s = self.debug_segments[pkey]
for v in v_s:
plt.plot([v[0], v[2]], [v[1], v[3]])
for h in h_s:
plt.plot([h[0], h[2]], [h[1], h[3]])
plt.show()
elif geometry == 'table':
for pkey in self.debug_tables.keys():
for table in self.debug_tables[pkey]:
for i in range(len(table.cells)):
for j in range(len(table.cells[i])):
if table.cells[i][j].left:
plt.plot([table.cells[i][j].lb[0],
table.cells[i][j].lt[0]],
[table.cells[i][j].lb[1],
table.cells[i][j].lt[1]])
if table.cells[i][j].right:
plt.plot([table.cells[i][j].rb[0],
table.cells[i][j].rt[0]],
[table.cells[i][j].rb[1],
table.cells[i][j].rt[1]])
if table.cells[i][j].top:
plt.plot([table.cells[i][j].lt[0],
table.cells[i][j].rt[0]],
[table.cells[i][j].lt[1],
table.cells[i][j].rt[1]])
if table.cells[i][j].bottom:
plt.plot([table.cells[i][j].lb[0],
table.cells[i][j].rb[0]],
[table.cells[i][j].lb[1],
table.cells[i][j].rb[1]])
plt.show()
return pdf_page

View File

@@ -1,18 +1,11 @@
import os
import shutil
import tempfile
import itertools
import multiprocessing as mp
import cv2
from PyPDF2 import PdfFileReader, PdfFileWriter
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal
from wand.image import Image
__all__ = ['Pdf']
@@ -38,38 +31,6 @@ def _parse_page_numbers(pagenos):
return page_numbers
def _extract_text_objects(layout, LTObject, t=None):
"""Recursively parses pdf layout to get a list of
text objects.
Parameters
----------
layout : object
Layout object.
LTObject : object
Text object, either LTChar or LTTextLineHorizontal.
t : list (optional, default: None)
Returns
-------
t : list
List of text objects.
"""
if t is None:
t = []
try:
for obj in layout._objs:
if isinstance(obj, LTObject):
t.append(obj)
else:
t += _extract_text_objects(obj, LTObject)
except AttributeError:
pass
return t
class Pdf:
"""Handles all pdf operations which include:
@@ -99,66 +60,163 @@ class Pdf:
is greater than word_margin. (optional, default: 0.1)
"""
def __init__(self, pdfname, pagenos=[{'start': 1, 'end': 1}],
char_margin=2.0, line_margin=0.5, word_margin=0.1,
clean=False):
def __init__(self, extractor, pdfname, pagenos=[{'start': 1, 'end': 1}],
parallel=False, clean=False):
self.extractor = extractor
self.pdfname = pdfname
if not self.pdfname.endswith('.pdf'):
raise TypeError("Only PDF format is supported right now.")
self.pagenos = _parse_page_numbers(pagenos)
self.char_margin = char_margin
self.line_margin = line_margin
self.word_margin = word_margin
self.parallel = parallel
self.cpu_count = mp.cpu_count()
self.pool = mp.Pool(processes=self.cpu_count)
self.clean = clean
self.temp = tempfile.mkdtemp()
def split(self):
"""Splits pdf into single page pdfs.
"""
if not self.pdfname.endswith('.pdf'):
raise TypeError("Only PDF format is supported.")
infile = PdfFileReader(open(self.pdfname, 'rb'), strict=False)
for p in self.pagenos:
page = infile.getPage(p - 1)
outfile = PdfFileWriter()
outfile.addPage(page)
with open(os.path.join(self.temp, 'pg-{0}.pdf'.format(p)), 'wb') as f:
with open(os.path.join(self.temp, 'page-{0}.pdf'.format(p)), 'wb') as f:
outfile.write(f)
def remove_tempdir(self):
shutil.rmtree(self.temp)
def extract(self):
"""Extracts text objects, width, height from a pdf.
"""
for p in self.pagenos:
pkey = 'pg-{0}'.format(p)
pname = os.path.join(self.temp, '{}.pdf'.format(pkey))
with open(pname, 'r') as f:
parser = PDFParser(f)
document = PDFDocument(parser)
if not document.is_extractable:
raise PDFTextExtractionNotAllowed
laparams = LAParams(char_margin=self.char_margin,
line_margin=self.line_margin,
word_margin=self.word_margin)
rsrcmgr = PDFResourceManager()
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.create_pages(document):
interpreter.process_page(page)
layout = device.get_result()
lattice_objects = _extract_text_objects(layout, LTChar)
stream_objects = _extract_text_objects(
layout, LTTextLineHorizontal)
width = layout.bbox[2]
height = layout.bbox[3]
yield p, lattice_objects, stream_objects, width, height
self.split()
pages = [os.path.join(self.temp, 'page-{0}.pdf'.format(p))
for p in self.pagenos]
if self.parallel:
tables = self.pool.map(self.extractor.get_tables, pages)
tables = {k: v for d in tables if d is not None for k, v in d.items()}
else:
tables = {}
if self.extractor.debug:
if self.extractor.method == 'stream':
self.debug = self.extractor.debug
self.debug_text = []
elif self.extractor.method == 'lattice':
self.debug = self.extractor.debug
self.debug_images = []
self.debug_segments = []
self.debug_tables = []
for p in pages:
table = self.extractor.get_tables(p)
if table is not None:
tables.update(table)
if self.extractor.debug:
if self.extractor.method == 'stream':
self.debug_text.append(self.extractor.debug_text)
elif self.extractor.method == 'lattice':
self.debug_images.append(self.extractor.debug_images)
self.debug_segments.append(self.extractor.debug_segments)
self.debug_tables.append(self.extractor.debug_tables)
if self.clean:
self.remove_tempdir()
return tables
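
Usage of the inverted control flow this commit introduces: the extractor is configured first, then handed to Pdf, which splits the pages and maps get_tables over them (through a Pool when parallel=True). A sketch; 'report.pdf' is a stand-in filename:

    from __future__ import print_function
    from camelot.pdf import Pdf
    from camelot.lattice import Lattice

    manager = Pdf(Lattice(fill=None, scale=15, jtol=2, mtol=2),
                  'report.pdf',  # illustrative path
                  pagenos=[{'start': 1, 'end': 2}],
                  parallel=False, clean=True)
    tables = manager.extract()
    # keys follow the 'page-N' / 'table_N' scheme used by the extractors
    for page in sorted(tables):
        for name, info in sorted(tables[page].items()):
            print(page, name, info['score'], info['nrows'], info['ncols'])
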
def convert(self):
"""Converts single page pdfs to images.
def debug_plot(self):
"""Plots all text objects and various pdf geometries so that
user can choose number of columns, columns x-coordinates for
Stream or tweak Lattice parameters (scale, jtol, mtol).
"""
for p in self.pagenos:
pdfname = os.path.join(self.temp, 'pg-{0}.pdf'.format(p))
imagename = os.path.join(self.temp, 'pg-{0}.png'.format(p))
with Image(filename=pdfname, depth=8, resolution=300) as png:
png.save(filename=imagename)
import matplotlib.pyplot as plt
import matplotlib.patches as patches
def remove_tempdir(self):
shutil.rmtree(self.temp)
if self.debug is True:
try:
for text in self.debug_text:
fig = plt.figure()
ax = fig.add_subplot(111, aspect='equal')
xs, ys = [], []
for t in text:
xs.extend([t[0], t[1]])
ys.extend([t[2], t[3]])
ax.add_patch(
patches.Rectangle(
(t[0], t[1]),
t[2] - t[0],
t[3] - t[1]
)
)
ax.set_xlim(min(xs) - 10, max(xs) + 10)
ax.set_ylim(min(ys) - 10, max(ys) + 10)
plt.show()
except AttributeError:
raise ValueError("This option only be used with Stream.")
elif self.debug == 'contour':
try:
for img, table_bbox in self.debug_images:
for t in table_bbox.keys():
cv2.rectangle(img, (t[0], t[1]),
(t[2], t[3]), (255, 0, 0), 3)
plt.imshow(img)
plt.show()
except AttributeError:
raise ValueError("This option only be used with Lattice.")
elif self.debug == 'joint':
try:
for img, table_bbox in self.debug_images:
x_coord = []
y_coord = []
for k in table_bbox.keys():
for coord in table_bbox[k]:
x_coord.append(coord[0])
y_coord.append(coord[1])
max_x, max_y = max(x_coord), max(y_coord)
plt.plot(x_coord, y_coord, 'ro')
plt.axis([0, max_x + 100, max_y + 100, 0])
plt.imshow(img)
plt.show()
except AttributeError:
raise ValueError("This option only be used with Lattice.")
elif self.debug == 'line':
try:
for v_s, h_s in self.debug_segments:
for v in v_s:
plt.plot([v[0], v[2]], [v[1], v[3]])
for h in h_s:
plt.plot([h[0], h[2]], [h[1], h[3]])
plt.show()
except AttributeError:
raise ValueError("This option only be used with Lattice.")
elif self.debug == 'table':
try:
for tables in self.debug_tables:
for table in tables:
for i in range(len(table.cells)):
for j in range(len(table.cells[i])):
if table.cells[i][j].left:
plt.plot([table.cells[i][j].lb[0],
table.cells[i][j].lt[0]],
[table.cells[i][j].lb[1],
table.cells[i][j].lt[1]])
if table.cells[i][j].right:
plt.plot([table.cells[i][j].rb[0],
table.cells[i][j].rt[0]],
[table.cells[i][j].rb[1],
table.cells[i][j].rt[1]])
if table.cells[i][j].top:
plt.plot([table.cells[i][j].lt[0],
table.cells[i][j].rt[0]],
[table.cells[i][j].lt[1],
table.cells[i][j].rt[1]])
if table.cells[i][j].bottom:
plt.plot([table.cells[i][j].lb[0],
table.cells[i][j].rb[0]],
[table.cells[i][j].lb[1],
table.cells[i][j].rb[1]])
plt.show()
except AttributeError:
raise ValueError("This option only be used with Lattice.")
else:
raise UserWarning("This method can only be called after"
" debug has been specified.")

View File

@@ -1,14 +1,26 @@
from __future__ import print_function
from __future__ import division
import os
import types
import copy_reg
import logging
import numpy as np
from .utils import get_column_index, encode_list
from .table import Table
from .utils import get_row_index, get_score, count_empty, encode_list, pdf_to_text
__all__ = ['Stream']
def _reduce_method(m):
if m.im_self is None:
return getattr, (m.im_class, m.im_func.func_name)
else:
return getattr, (m.im_self, m.im_func.func_name)
copy_reg.pickle(types.MethodType, _reduce_method)
def _group_rows(text, ytol=2):
"""Groups text objects into rows using ytol.
@@ -35,14 +47,16 @@ def _group_rows(text, ytol=2):
# type(obj) is LTChar]):
if t.get_text().strip():
if not np.isclose(row_y, t.y0, atol=ytol):
row_y = t.y0
rows.append(temp)
rows.append(sorted(temp, key=lambda t: t.x0))
temp = []
row_y = t.y0
temp.append(t)
rows.append(sorted(temp, key=lambda t: t.x0))
__ = rows.pop(0) # hacky
return rows
def _merge_columns(l):
def _merge_columns(l, mtol=2):
"""Merges overlapping columns and returns list with updated
columns boundaries.
@@ -62,7 +76,8 @@ def _merge_columns(l):
merged.append(higher)
else:
lower = merged[-1]
if higher[0] <= lower[1]:
if (higher[0] <= lower[1] or
np.isclose(higher[0], lower[1], atol=mtol)):
upper_bound = max(lower[1], higher[1])
lower_bound = min(lower[0], higher[0])
merged[-1] = (lower_bound, upper_bound)
@@ -71,6 +86,62 @@ def _merge_columns(l):
return merged
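
A worked example of the mtol behaviour added here (numbers illustrative): (9, 15) overlaps (0, 10) and merges into (0, 15); (16, 17) starts within mtol of 15, so np.isclose pulls it in too; (30, 40) stands alone.

    print(_merge_columns([(0, 10), (9, 15), (16, 17), (30, 40)], mtol=2))
    # [(0, 17), (30, 40)]
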
def _get_column_index(t, columns):
"""Gets index of the column in which the given object falls by
comparing their co-ordinates.
Parameters
----------
t : object
columns : list
Returns
-------
c : int
"""
offset1, offset2 = 0, 0
lt_col_overlap = []
for c in columns:
if c[0] <= t.x1 and c[1] >= t.x0:
left = t.x0 if c[0] <= t.x0 else c[0]
right = t.x1 if c[1] >= t.x1 else c[1]
lt_col_overlap.append(abs(left - right) / abs(c[0] - c[1]))
else:
lt_col_overlap.append(-1)
if len(filter(lambda x: x != -1, lt_col_overlap)) == 0:
logging.warning("Text doesn't fit any column.")
c_idx = lt_col_overlap.index(max(lt_col_overlap))
if t.x0 < columns[c_idx][0]:
offset1 = abs(t.x0 - columns[c_idx][0])
if t.x1 > columns[c_idx][1]:
offset2 = abs(t.x1 - columns[c_idx][1])
Y = abs(t.y0 - t.y1)
charea = abs(t.x0 - t.x1) * abs(t.y0 - t.y1)
error = (Y * (offset1 + offset2)) / charea
return c_idx, error
def _add_columns(cols, text, ytolerance):
if text:
text = _group_rows(text, ytol=ytolerance)
elements = [len(r) for r in text]
new_cols = [(t.x0, t.x1)
for r in text if len(r) == max(elements) for t in r]
cols.extend(_merge_columns(sorted(new_cols)))
return cols
def _join_columns(cols, width):
cols = sorted(cols)
cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
cols.insert(0, 0)
cols.append(width) # or some tolerance
cols = [(cols[i], cols[i + 1])
for i in range(0, len(cols) - 1)]
return cols
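
And a worked example of _join_columns (numbers illustrative): two merged text extents on a page 100 points wide are split at their midpoint, with the page edges closing off the outer columns.

    print(_join_columns([(10, 20), (40, 50)], 100))
    # [(0, 30.0), (30.0, 100)]
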
class Stream:
"""Stream algorithm
@@ -105,20 +176,18 @@ class Stream:
page as value.
"""
def __init__(self, pdfobject, ncolumns=0, columns=None, ytol=2,
debug=False, verbose=False):
def __init__(self, ncolumns=0, columns=None, ytol=2, mtol=2,
pdf_margin=(2.0, 0.5, 0.1), debug=False):
self.pdfobject = pdfobject
self.method = 'stream'
self.ncolumns = ncolumns
self.columns = columns
self.ytol = ytol
self.mtol = mtol
self.char_margin, self.line_margin, self.word_margin = pdf_margin
self.debug = debug
self.verbose = verbose
self.tables = {}
if self.debug:
self.debug_text = {}
def get_tables(self):
def get_tables(self, pdfname):
"""Returns all tables found in given pdf.
Returns
@@ -127,86 +196,112 @@ class Stream:
Dictionary with page number as key and list of tables on that
page as value.
"""
vprint = print if self.verbose else lambda *a, **k: None
self.pdfobject.split()
for page in self.pdfobject.extract():
p, __, text, __, __ = page
pkey = 'pg-{0}'.format(p)
text.sort(key=lambda x: (-x.y0, x.x0))
if self.debug:
self.debug_text[pkey] = text
rows = _group_rows(text, ytol=self.ytol)
elements = [len(r) for r in rows]
# a table can't have just 1 column, can it?
elements = filter(lambda x: x != 1, elements)
guess = False
if self.columns:
cols = self.columns.split(',')
cols = [(float(cols[i]), float(cols[i + 1]))
for i in range(0, len(cols) - 1)]
else:
guess = True
ncols = self.ncolumns if self.ncolumns else max(
set(elements), key=elements.count)
if ncols == 0:
# no tables detected
continue
cols = [(t.x0, t.x1)
for r in rows for t in r if len(r) == ncols]
cols = _merge_columns(sorted(cols))
cols = [(c[0] + c[1]) / 2.0 for c in cols]
ar = [['' for c in cols] for r in rows]
for r_idx, r in enumerate(rows):
for t in r:
if guess:
cog = (t.x0 + t.x1) / 2.0
diff = [abs(cog - c) for c in cols]
c_idx = diff.index(min(diff))
else:
c_idx = get_column_index(t, cols)
if None in [r_idx, c_idx]: # couldn't assign LTTextLH to any cell
continue
if ar[r_idx][c_idx]:
ar[r_idx][c_idx] = ' '.join(
[ar[r_idx][c_idx], t.get_text().strip()])
else:
ar[r_idx][c_idx] = t.get_text().strip()
vprint(pkey)
self.tables[pkey] = [encode_list(ar)]
if self.pdfobject.clean:
self.pdfobject.remove_tempdir()
__, text, width, height = pdf_to_text(pdfname, self.char_margin,
self.line_margin, self.word_margin)
bname, __ = os.path.splitext(pdfname)
if not text:
logging.warning("{0}: PDF has no text. It may be an image.".format(
os.path.basename(bname)))
return None
text.sort(key=lambda x: (-x.y0, x.x0))
if self.debug:
self.debug_text = [(t.x0, t.y0, t.x1, t.y1) for t in text]
return None
return self.tables
rows_grouped = _group_rows(text, ytol=self.ytol)
elements = [len(r) for r in rows_grouped]
row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r)
if len(r) > 0 else 0 for r in rows_grouped]
rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
rows.insert(0, height) # or some tolerance
rows.append(0)
rows = [(rows[i], rows[i + 1])
for i in range(0, len(rows) - 1)]
def plot_text(self):
"""Plots all text objects so user can choose number of columns
or columns x-coordinates using the matplotlib interface.
"""
import matplotlib.pyplot as plt
import matplotlib.patches as patches
guess = False
if self.columns:
# the user-supplied columns should include the table's boundary
# columns too; (0, width) is taken by default, as in the else
# branch below, and a single x-coordinate is not a valid input
cols = self.columns.split(',')
cols = [(float(cols[i]), float(cols[i + 1]))
for i in range(0, len(cols) - 1)]
else:
if self.ncolumns:
ncols = self.ncolumns
cols = [(t.x0, t.x1)
for r in rows_grouped if len(r) == ncols for t in r]
cols = _merge_columns(sorted(cols), mtol=self.mtol)
if len(cols) != self.ncolumns:
logging.warning("{}: The number of columns after merge"
" isn't the same as what you specified."
" Change the value of mtol.".format(
os.path.basename(bname)))
cols = _join_columns(cols, width)
else:
guess = True
ncols = max(set(elements), key=elements.count)
len_non_mode = len(filter(lambda x: x != ncols, elements))
if ncols == 1 and not self.debug:
# no tables detected
logging.warning("{}: Only one column was detected, the PDF"
" may have no tables. Specify ncols if"
" the PDF has tables.".format(
os.path.basename(bname)))
cols = [(t.x0, t.x1)
for r in rows_grouped if len(r) == ncols for t in r]
cols = _merge_columns(sorted(cols), mtol=self.mtol)
inner_text = []
for i in range(1, len(cols)):
left = cols[i - 1][1]
right = cols[i][0]
inner_text.extend([t for t in text if t.x0 > left and t.x1 < right])
outer_text = [t for t in text if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
inner_text.extend(outer_text)
cols = _add_columns(cols, inner_text, self.ytol)
cols = _join_columns(cols, width)
for pkey in sorted(self.debug_text.keys()):
fig = plt.figure()
ax = fig.add_subplot(111, aspect='equal')
xs, ys = [], []
for t in self.debug_text[pkey]:
xs.extend([t.x0, t.x1])
ys.extend([t.y0, t.y1])
ax.add_patch(
patches.Rectangle(
(t.x0, t.y0),
t.x1 - t.x0,
t.y1 - t.y0
)
)
ax.set_xlim(min(xs) - 10, max(xs) + 10)
ax.set_ylim(min(ys) - 10, max(ys) + 10)
plt.show()
pdf_page = {}
page_tables = {}
table_info = {}
table = Table(cols, rows)
rerror = []
cerror = []
for row in rows_grouped:
for t in row:
try:
r_idx, rass_error = get_row_index(t, rows)
except ValueError as e:
# couldn't assign LTTextLH to any cell
logging.warning(e.message)
continue
try:
c_idx, cass_error = _get_column_index(t, cols)
except ValueError as e:
# couldn't assign LTTextLH to any cell
logging.warning(e.message)
continue
rerror.append(rass_error)
cerror.append(cass_error)
table.cells[r_idx][c_idx].add_text(
t.get_text().strip('\n'))
if guess:
score = get_score([[33, rerror], [33, cerror], [34, [len_non_mode / len(elements)]]])
else:
score = get_score([[50, rerror], [50, cerror]])
table_info['score'] = score
ar = table.get_list()
ar = encode_list(ar)
table_info['data'] = ar
empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
table_info['empty_p'] = empty_p
table_info['r_nempty_cells'] = r_nempty_cells
table_info['c_nempty_cells'] = c_nempty_cells
table_info['nrows'] = len(ar)
table_info['ncols'] = len(ar[0])
page_tables['table_1'] = table_info
pdf_page[os.path.basename(bname)] = page_tables
return pdf_page
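
Each table in the dict returned above now carries its quality metrics next to its data, which makes downstream filtering straightforward. A small consumer sketch (best_table is hypothetical, not part of the commit):

    from __future__ import print_function

    def best_table(pdf_page):
        # pdf_page maps a page basename to a {'table_N': table_info} dict
        page_tables = list(pdf_page.values())[0]
        name = max(page_tables, key=lambda k: page_tables[k]['score'])
        info = page_tables[name]
        print('{0}: score={1:.2f}, empty_p={2:.2f}'.format(
            name, info['score'], info['empty_p']))
        return info['data']
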

View File

@@ -26,6 +26,7 @@ class Table:
self.rows = rows
self.cells = [[Cell(c[0], r[1], c[1], r[0])
for c in cols] for r in rows]
self.nocont_ = 0
def set_edges(self, vertical, horizontal, jtol=2):
"""Sets cell edges to True if corresponding line segments
@@ -53,6 +54,7 @@ def set_edges(self, vertical, horizontal, jtol=2):
k = [k for k, t in enumerate(self.rows)
if np.isclose(v[1], t[0], atol=jtol)]
if not j:
self.nocont_ += 1
continue
J = j[0]
if i == [0]: # only left edge
@@ -104,6 +106,7 @@ def set_edges(self, vertical, horizontal, jtol=2):
k = [k for k, t in enumerate(self.cols)
if np.isclose(h[2], t[0], atol=jtol)]
if not j:
self.nocont_ += 1
continue
J = j[0]
if i == [0]: # only top edge

View File

@@ -1,5 +1,18 @@
from __future__ import division
import os
import numpy as np
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal
def translate(x1, x2):
"""Translates x2 by x1.
@@ -243,15 +256,24 @@ def get_row_index(t, rows):
----------
t : object
rows : list
rows : list, sorted in decreasing order
Returns
-------
r : int
"""
offset1, offset2 = 0, 0
for r in range(len(rows)):
if (t.y0 + t.y1) / 2.0 < rows[r][0] and (t.y0 + t.y1) / 2.0 > rows[r][1]:
return r
if t.y0 > rows[r][0]:
offset1 = abs(t.y0 - rows[r][0])
if t.y1 < rows[r][1]:
offset2 = abs(t.y1 - rows[r][1])
X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1)
Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1)
charea = X * Y
error = (X * (offset1 + offset2)) / charea
return r, error
def get_column_index(t, columns):
@@ -268,9 +290,45 @@ def get_column_index(t, columns):
-------
c : int
"""
offset1, offset2 = 0, 0
for c in range(len(columns)):
if (t.x0 + t.x1) / 2.0 > columns[c][0] and (t.x0 + t.x1) / 2.0 < columns[c][1]:
return c
if t.x0 < columns[c][0]:
offset1 = abs(t.x0 - columns[c][0])
if t.x1 > columns[c][1]:
offset2 = abs(t.x1 - columns[c][1])
X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1)
Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1)
charea = X * Y
error = (Y * (offset1 + offset2)) / charea
return c, error
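
A worked example of the error term returned above (numbers illustrative; Box stands in for a pdfminer layout object). Since charea = X * Y, the Y factors cancel and the error is simply the fraction of the object's width that overhangs its column; get_row_index computes the mirror image for heights.

    from collections import namedtuple

    Box = namedtuple('Box', 'x0 x1 y0 y1')  # stand-in for an LTChar bbox
    t = Box(x0=8, x1=20, y0=0, y1=10)       # midpoint 14 falls in (10, 25)
    c_idx, error = get_column_index(t, [(10, 25), (25, 40)])
    print('{0} {1:.3f}'.format(c_idx, error))  # 0 0.167 -> 2 of 12 points overhang
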
def get_score(error_weights):
"""Calculates score based on weights assigned to various parameters,
and their error percentages.
Parameters
----------
error_weights : list
List of [weight, error_list] pairs, where weight is the weightage
assigned to a parameter and error_list holds its error fractions
(each between 0 and 1). The weights should sum to 100.
Returns
-------
score : float
"""
SCORE_VAL = 100
score = 0
if sum([ew[0] for ew in error_weights]) != SCORE_VAL:
raise ValueError("Please assign a valid weightage to each parameter"
" such that their sum is equal to 100")
for ew in error_weights:
weight = ew[0] / len(ew[1])
for error_percentage in ew[1]:
score += weight * (1 - error_percentage)
return score
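
A quick check of the arithmetic (numbers illustrative): two parameters weighted 50/50, each carrying per-object error fractions. Each weight is spread evenly across its list, so the call below scores 25*(1-0.0) + 25*(1-0.2) + 50*(1-0.1) = 90.0.

    print(get_score([[50, [0.0, 0.2]], [50, [0.1]]]))  # 90.0
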
def reduce_index(t, rotated, r_idx, c_idx):
@@ -394,6 +452,110 @@ def remove_empty(d):
return d
def count_empty(d):
"""Counts empty rows and columns from list of lists.
Parameters
----------
d : list
Returns
-------
empty_p : percentage of empty cells
r_nempty_cells : list with the number of non-empty cells in each row
c_nempty_cells : list with the number of non-empty cells in each column
"""
empty_p = 0
r_nempty_cells, c_nempty_cells = [], []
for i in d:
for j in i:
if j.strip() == '':
empty_p += 1
empty_p = 100 * (empty_p / float(len(d) * len(d[0])))
for row in d:
r_nempty_c = 0
for r in row:
if r.strip() != '':
r_nempty_c += 1
r_nempty_cells.append(r_nempty_c)
d = zip(*d)
d = [list(col) for col in d]
for col in d:
c_nempty_c = 0
for c in col:
if c.strip() != '':
c_nempty_c += 1
c_nempty_cells.append(c_nempty_c)
return empty_p, r_nempty_cells, c_nempty_cells
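
A quick check with a 2x3 grid (values illustrative): one of six cells is empty, the rows have 3 and 2 non-empty cells, and the columns have 2, 2 and 1.

    print(count_empty([['a', 'b', 'c'], ['d', 'e', '']]))
    # (16.666666666666664, [3, 2], [2, 2, 1])
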
def encode_list(ar):
"""Encodes list of text.
Parameters
----------
ar : list
Returns
-------
ar : list
"""
ar = [[r.encode('utf-8') for r in row] for row in ar]
return ar
def extract_text_objects(layout, LTObject, t=None):
"""Recursively parses pdf layout to get a list of
text objects.
Parameters
----------
layout : object
Layout object.
LTObject : object
Text object, either LTChar or LTTextLineHorizontal.
t : list (optional, default: None)
Returns
-------
t : list
List of text objects.
"""
if t is None:
t = []
try:
for obj in layout._objs:
if isinstance(obj, LTObject):
t.append(obj)
else:
t += extract_text_objects(obj, LTObject)
except AttributeError:
pass
return t
def pdf_to_text(pname, char_margin, line_margin, word_margin):
# pkey = 'page-{0}'.format(p)
# pname = os.path.join(self.temp, '{}.pdf'.format(pkey))
with open(pname, 'rb') as f:  # PDFs are binary; open in 'rb' mode
parser = PDFParser(f)
document = PDFDocument(parser)
if not document.is_extractable:
raise PDFTextExtractionNotAllowed
laparams = LAParams(char_margin=char_margin,
line_margin=line_margin,
word_margin=word_margin)
rsrcmgr = PDFResourceManager()
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.create_pages(document):
interpreter.process_page(page)
layout = device.get_result()
lattice_objects = extract_text_objects(layout, LTChar)
stream_objects = extract_text_objects(
layout, LTTextLineHorizontal)
width = layout.bbox[2]
height = layout.bbox[3]
return lattice_objects, stream_objects, width, height
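
Usage sketch for the consolidated helper above (the filename is illustrative): both extractors now call it once per split page, Lattice consuming the LTChar list and Stream the LTTextLineHorizontal list.

    chars, lines, width, height = pdf_to_text('page-1.pdf', 2.0, 0.5, 0.1)
    print('{0} chars, {1} lines, {2}x{3}'.format(
        len(chars), len(lines), width, height))
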

View File

@@ -39,7 +39,7 @@ Usage
>>> extractor = Lattice(Pdf('us-030.pdf'))
>>> tables = extractor.get_tables()
>>> print tables['pg-1']
>>> print tables['page-1'][0]
.. csv-table::
:header: "Cycle Name","KI (1/km)","Distance (mi)","Percent Fuel Savings","","",""

View File

@@ -65,7 +65,7 @@ Finally, the characters found on the page are assigned to cells based on their x
>>> extractor = Lattice(Pdf('us-030.pdf'))
>>> tables = extractor.get_tables()
>>> print tables['pg-1']
>>> print tables['page-1'][0]
.. csv-table::
:header: "Cycle Name","KI (1/km)","Distance (mi)","Percent Fuel Savings","","",""
@@ -114,7 +114,7 @@ In the PDF used above, you can see that some cells spanned a lot of rows, `fill`
>>> extractor = Lattice(Pdf('row_span_1.pdf'), fill='v', scale=40)
>>> tables = extractor.get_tables()
>>> print tables['pg-1']
>>> print tables['page-1'][0]
.. csv-table::
:header: "Plan Type","County","Plan Name","Totals"
@@ -173,7 +173,7 @@ To find line segments, Lattice needs the lines of the PDF to be in foreground. S
>>> extractor = Lattice(Pdf('lines_in_background_1.pdf'), invert=True)
>>> tables = extractor.get_tables()
>>> print tables['pg-1']
>>> print tables['page-1'][0]
.. csv-table::
:header: "State","Date","Halt stations","Halt days","Persons directly reached(in lakh)","Persons trained","Persons counseled","Persons testedfor HIV"

View File

@@ -17,7 +17,7 @@ Let's run it on this PDF.
>>> extractor = Stream(Pdf('eu-027.pdf'))
>>> tables = extractor.get_tables()
>>> print tables['pg-1']
>>> print tables['page-1'][0]
.. .. _this: insert link for eu-027.pdf
@@ -68,7 +68,7 @@ But sometimes its guess could be incorrect, like in this case.
>>> extractor = Stream(Pdf('missing_values.pdf'))
>>> tables = extractor.get_tables()
>>> print tables['pg-1']
>>> print tables['page-1'][0]
.. .. _this: insert link for missing_values.pdf
@@ -127,7 +127,7 @@ It guessed that the PDF has 3 columns, because there wasn't any data in the last
>>> extractor = Stream(Pdf('missing_values.pdf'), ncolumns=5)
>>> tables = extractor.get_tables()
>>> print tables['pg-1']
>>> print tables['page-1'][0]
.. csv-table::
@@ -200,7 +200,7 @@ After getting the x-coordinates, we just need to pass them to Stream, like this.
>>> extractor = Stream(Pdf('mexican_towns.pdf'), columns='28,67,180,230,425,475,700')
>>> tables = extractor.get_tables()
>>> print tables['pg-1']
>>> print tables['page-1'][0]
.. csv-table::

View File

@@ -26,7 +26,7 @@ def test_lattice_basic():
extractor = Lattice(Pdf(pdfname,
pagenos=[{'start': 2, 'end': 2}], clean=True))
tables = extractor.get_tables()
assert_equal(tables['pg-2'][0], data)
assert_equal(tables['page-2'][0], data)
def test_lattice_fill():
@@ -76,7 +76,7 @@ def test_lattice_fill():
pdfname = os.path.join(testdir, 'row_span_1.pdf')
extractor = Lattice(Pdf(pdfname, clean=True), fill='v', scale=40)
tables = extractor.get_tables()
assert_equal(tables['pg-1'][0], data)
assert_equal(tables['page-1'][0], data)
def test_lattice_invert():
@@ -94,4 +94,4 @@ def test_lattice_invert():
pdfname = os.path.join(testdir, 'lines_in_background_1.pdf')
extractor = Lattice(Pdf(pdfname, clean=True), invert=True)
tables = extractor.get_tables()
assert_equal(tables['pg-1'][1], data)
assert_equal(tables['page-1'][1], data)

View File

@@ -13,57 +13,62 @@ testdir = os.path.dirname(os.path.abspath(__file__))
def test_stream_basic():
data = [
["","","","",""],
["C Appendix C: Summary Statistics","","","",""],
["","Table C1: Summary Statistics","","",""],
["","This table contains summary statistics for 2,012 respondents in SAVE 2009.","","",""],
["Variable","Mean","Std. Dev. Min","","Max"],
["Age","50.8","15.9","21","90"],
["Men","0.47","0.50","0","1"],
["East","0.28","0.45","0","1"],
["Rural","0.15","0.36","0","1"],
["Married","0.57","0.50","0","1"],
["Single","0.21","0.40","0","1"],
["Divorced","0.13","0.33","0","1"],
["Widowed","0.08","0.26","0","1"],
["Separated","0.03","0.16","0","1"],
["Partner","0.65","0.48","0","1"],
["Employed","0.55","0.50","0","1"],
["Fulltime","0.34","0.47","0","1"],
["Parttime","0.20","0.40","0","1"],
["Unemployed","0.08","0.28","0","1"],
["Homemaker","0.19","0.40","0","1"],
["Retired","0.28","0.45","0","1"],
["Household size","2.43","1.22","1","9"],
["Households with children","0.37","0.48","0","1"],
["Number of children","1.67","1.38","0","8"],
["Lower secondary education","0.08","0.27","0","1"],
["Upper secondary education","0.60","0.49","0","1"],
["Post secondary, non tert. education","0.12","0.33","0","1"],
["First stage tertiary education","0.17","0.38","0","1"],
["Other education","0.03","0.17","0","1"],
["Household income (Euro/month)","2,127","1,389","22","22,500"],
["Gross wealth - end of 2007 (Euro)","187,281","384,198","0","7,720,000"],
["Gross financial wealth - end of 2007 (Euro)","38,855","114,128","0","2,870,000"],
["","Source: SAVE 2008 and 2009, data is weighted and imputed.","","",""],
["","","","","ECB"],
["","","","","Working Paper Series No 1299"],
["","","","","Febuary 2011"]
["Clave","Nombre Entidad","Clave","Nombre Municipio","Clave","Nombre Localidad"],
["Entidad","","Municipio","","Localidad",""],
["01","Aguascalientes","001","Aguascalientes","0094","Granja Adelita"],
["01","Aguascalientes","001","Aguascalientes","0096","Agua Azul"],
["01","Aguascalientes","001","Aguascalientes","0100","Rancho Alegre"],
["01","Aguascalientes","001","Aguascalientes","0102","Los Arbolitos [Rancho]"],
["01","Aguascalientes","001","Aguascalientes","0104","Ardillas de Abajo (Las Ardillas)"],
["01","Aguascalientes","001","Aguascalientes","0106","Arellano"],
["01","Aguascalientes","001","Aguascalientes","0112","Bajío los Vázquez"],
["01","Aguascalientes","001","Aguascalientes","0113","Bajío de Montoro"],
["01","Aguascalientes","001","Aguascalientes","0114","Residencial San Nicolás [Baños la Cantera]"],
["01","Aguascalientes","001","Aguascalientes","0120","Buenavista de Peñuelas"],
["01","Aguascalientes","001","Aguascalientes","0121","Cabecita 3 Marías (Rancho Nuevo)"],
["01","Aguascalientes","001","Aguascalientes","0125","Cañada Grande de Cotorina"],
["01","Aguascalientes","001","Aguascalientes","0126","Cañada Honda [Estación]"],
["01","Aguascalientes","001","Aguascalientes","0127","Los Caños"],
["01","Aguascalientes","001","Aguascalientes","0128","El Cariñán"],
["01","Aguascalientes","001","Aguascalientes","0129","El Carmen [Granja]"],
["01","Aguascalientes","001","Aguascalientes","0135","El Cedazo (Cedazo de San Antonio)"],
["01","Aguascalientes","001","Aguascalientes","0138","Centro de Arriba (El Taray)"],
["01","Aguascalientes","001","Aguascalientes","0139","Cieneguilla (La Lumbrera)"],
["01","Aguascalientes","001","Aguascalientes","0141","Cobos"],
["01","Aguascalientes","001","Aguascalientes","0144","El Colorado (El Soyatal)"],
["01","Aguascalientes","001","Aguascalientes","0146","El Conejal"],
["01","Aguascalientes","001","Aguascalientes","0157","Cotorina de Abajo"],
["01","Aguascalientes","001","Aguascalientes","0162","Coyotes"],
["01","Aguascalientes","001","Aguascalientes","0166","La Huerta (La Cruz)"],
["01","Aguascalientes","001","Aguascalientes","0170","Cuauhtémoc (Las Palomas)"],
["01","Aguascalientes","001","Aguascalientes","0171","Los Cuervos (Los Ojos de Agua)"],
["01","Aguascalientes","001","Aguascalientes","0172","San José [Granja]"],
["01","Aguascalientes","001","Aguascalientes","0176","La Chiripa"],
["01","Aguascalientes","001","Aguascalientes","0182","Dolores"],
["01","Aguascalientes","001","Aguascalientes","0183","Los Dolores"],
["01","Aguascalientes","001","Aguascalientes","0190","El Duraznillo"],
["01","Aguascalientes","001","Aguascalientes","0191","Los Durón"],
["01","Aguascalientes","001","Aguascalientes","0197","La Escondida"],
["01","Aguascalientes","001","Aguascalientes","0201","Brande Vin [Bodegas]"],
["01","Aguascalientes","001","Aguascalientes","0207","Valle Redondo"],
["01","Aguascalientes","001","Aguascalientes","0209","La Fortuna"],
["01","Aguascalientes","001","Aguascalientes","0212","Lomas del Gachupín"],
["01","Aguascalientes","001","Aguascalientes","0213","El Carmen (Gallinas Güeras) [Rancho]"],
["01","Aguascalientes","001","Aguascalientes","0216","La Gloria"],
["01","Aguascalientes","001","Aguascalientes","0226","Hacienda Nueva"],
]
pdfname = os.path.join(testdir,
"tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-027.pdf")
extractor = Stream(Pdf(pdfname, pagenos=[{'start': 3, 'end': 3}],
pdfname = os.path.join(testdir, 'mexican_towns.pdf')
extractor = Stream(Pdf(pdfname, pagenos=[{'start': 1, 'end': 1}],
clean=True))
tables = extractor.get_tables()
assert_equal(tables['pg-3'][0], data)
assert_equal(tables['page-1'][0], data)
def test_stream_ncolumns():
data = [
["","","","",""],
["","Bhandara - Key Indicators","","",""],
["Bhandara - Key Indicators","","","",""],
["","DLHS-4 (2012-13)","","DLHS-3 (2007-08)",""],
["Indicators","TOTAL","RURAL","TOTAL","RURAL"],
["Reported Prevalence of Morbidity","","","",""],
@@ -105,21 +110,20 @@ def test_stream_ncolumns():
["Moderately High (Systolic >160 mm of Hg & Diastolic >100 mm of Hg ) .....................................","8.2","7.1","",""],
["Very High (Systolic >180 mm of Hg & Diastolic >110 mm of Hg ) ...............................................","3.7","3.1","",""],
["14 Any anaemia below 11g/dl, severe anaemia below 7g/dl. 15 Excluding age group 19 years","","","",""],
["","Chronic Illness :Any person with symptoms persisting for longer than one month is defined as suffering from chronic illness","","",""]
["Chronic Illness :Any person with symptoms persisting for longer than one month is defined as suffering from chronic illness","","","",""],
["","4","","",""]
]
pdfname = os.path.join(testdir, 'missing_values.pdf')
extractor = Stream(Pdf(pdfname, char_margin=1.0, clean=True),
ncolumns=5)
tables = extractor.get_tables()
assert_equal(tables['pg-1'][0], data)
assert_equal(tables['page-1'][0], data)
def test_stream_columns():
data = [
["","","","","",""],
["Clave","","Clave","","Clave",""],
["","Nombre Entidad","","Nombre Municipio","","Nombre Localidad"],
["Clave","Nombre Entidad","Clave","Nombre Municipio","Clave","Nombre Localidad"],
["Entidad","","Municipio","","Localidad",""],
["01","Aguascalientes","001","Aguascalientes","0094","Granja Adelita"],
["01","Aguascalientes","001","Aguascalientes","0096","Agua Azul"],
@@ -160,10 +164,11 @@ def test_stream_columns():
["01","Aguascalientes","001","Aguascalientes","0209","La Fortuna"],
["01","Aguascalientes","001","Aguascalientes","0212","Lomas del Gachupín"],
["01","Aguascalientes","001","Aguascalientes","0213","El Carmen (Gallinas Güeras) [Rancho]"],
["01","Aguascalientes","001","Aguascalientes","0216","La Gloria"]
["01","Aguascalientes","001","Aguascalientes","0216","La Gloria"],
["01","Aguascalientes","001","Aguascalientes","0226","Hacienda Nueva"],
]
pdfname = os.path.join(testdir, 'mexican_towns.pdf')
extractor = Stream(Pdf(pdfname, clean=True),
columns='28,67,180,230,425,475,700')
tables = extractor.get_tables()
assert_equal(tables['pg-1'][0], data)
assert_equal(tables['page-1'][0], data)

View File

@@ -4,8 +4,12 @@ import os
import sys
import time
import logging
import warnings
import numpy as np
from docopt import docopt
from collections import Counter
import matplotlib.pyplot as plt
from PyPDF2 import PdfFileReader
from camelot.pdf import Pdf
@@ -22,12 +26,23 @@ usage:
options:
-h, --help Show this screen.
-v, --version Show version.
-V, --verbose Verbose.
-p, --pages <pageno> Comma-separated list of page numbers.
Example: -p 1,3-6,10 [default: 1]
-P, --parallel Parallelize the parsing process.
-f, --format <format> Output format. (csv,tsv,html,json,xlsx) [default: csv]
-l, --log Print log to file.
-V, --verbose Verbose.
-l, --log Log to file.
-o, --output <directory> Output directory.
-M, --cmargin <cmargin> Char margin. Chars closer than cmargin are
grouped together to form a word. [default: 2.0]
-L, --lmargin <lmargin> Line margin. Lines closer than lmargin are
grouped together to form a textbox. [default: 0.5]
-W, --wmargin <wmargin> Word margin. Insert blank spaces between chars
if distance between words is greater than word
margin. [default: 0.1]
-S, --save-info Save parsing info for each page to a file.
-X, --plot <dist> Plot distributions. (page,all,rc)
-Z, --summary Summarize metrics.
camelot methods:
lattice Looks for lines between data.
@@ -47,12 +62,12 @@ options:
cells. Example: -F h, -F v, -F hv
-s, --scale <scale> Scaling factor. Large scaling factor leads to
smaller lines being detected. [default: 15]
-i, --invert Invert pdf image to make sure that lines are
in foreground.
-j, --jtol <jtol> Tolerance to account for when comparing joint
and line coordinates. [default: 2]
-m, --mtol <mtol> Tolerance to account for when merging lines
which are very close. [default: 2]
-i, --invert Invert pdf image to make sure that lines are
in foreground.
-d, --debug <debug> Debug by visualizing pdf geometry.
(contour,line,joint,table) Example: -d table
"""
@@ -69,17 +84,159 @@ options:
Example: -c 10.1,20.2,30.3
-y, --ytol <ytol> Tolerance to account for when grouping rows
together. [default: 2]
-M, --cmargin <cmargin> Char margin. Chars closer than cmargin are
grouped together to form a word. [default: 2.0]
-L, --lmargin <lmargin> Line margin. Lines closer than lmargin are
grouped together to form a textbox. [default: 0.5]
-W, --wmargin <wmargin> Word margin. Insert blank spaces between chars
if distance between words is greater than word
margin. [default: 0.1]
-m, --mtol <mtol> Tolerance to account for when merging columns
together. [default: 2]
-d, --debug Debug by visualizing textboxes.
"""
def plot_table_barchart(r, c, p, pno, tno):
row_idx = [i + 1 for i, row in enumerate(r)]
col_idx = [i + 1 for i, col in enumerate(c)]
r_index = np.arange(len(r))
c_index = np.arange(len(c))
width = 0.7
plt.figure(figsize=(8, 6))
plt.subplot(2, 1, 1)
plt.title('Percentage of empty cells in table: {0:.2f}'.format(p))
plt.xlabel('row index')
plt.ylabel('number of non-empty cells in row')
plt.bar(r_index, r)
plt.xticks(r_index + width * 0.5, row_idx)
plt.ylim(0, len(c))
plt.subplot(2, 1, 2)
plt.xlabel('column index')
plt.ylabel('number of non-empty cells in column')
plt.bar(c_index, c)
plt.xticks(c_index + width * 0.5, col_idx)
plt.ylim(0, len(r))
plt.savefig(''.join([pno, '_', tno, '.png']), dpi=300)
def plot_all_barchart(data, output):
r_empty_cells = []
for page_number in data.keys():
page = data[page_number]
for table_number in page.keys():
table = page[table_number]
r_empty_cells.extend([r / float(table['ncols']) for r in table['r_nempty_cells']])
c = Counter(r_empty_cells)
if 0.0 not in c:
c.update({0.0: 0})
if 1.0 not in c:
c.update({1.0: 0})
plt.figure(figsize=(8, 6))
plt.xlabel('percentage of non-empty cells in a row')
plt.ylabel('percentage of rows processed')
row_p = [count / float(sum(c.values())) for count in c.values()]
plt.bar(c.keys(), row_p, align='center', width=0.05)
plt.ylim(0, 1.0)
plt.savefig(''.join([output, '_all.png']), dpi=300)
def plot_rc_piechart(data, output):
from matplotlib import cm
tables = 0
rows, cols = [], []
for page_number in data.keys():
page = data[page_number]
for table_number in page.keys():
table = page[table_number]
tables += 1
rows.append(table['nrows'])
cols.append(table['ncols'])
r = Counter(rows)
c = Counter(cols)
plt.figure(figsize=(8, 6))
cs1 = cm.Set1(np.arange(len(r)) / float(len(r)))
ax1 = plt.subplot(211, aspect='equal')
ax1.pie(r.values(), colors=cs1, labels=r.keys(), startangle=90)
ax1.set_title('row distribution across tables')
cs2 = cm.Set1(np.arange(len(c)) / float(len(c)))
ax2 = plt.subplot(212, aspect='equal')
ax2.pie(c.values(), colors=cs2, labels=c.keys(), startangle=90)
ax2.set_title('column distribution across tables')
plt.savefig(''.join([output, '_rc.png']), dpi=300)
def summary(data, p_time):
from operator import itemgetter
from itertools import groupby
scores = []
continuous_tables = []
total_tables = 0
for page_number in data.keys():
page = data[page_number]
total_tables += len(page.keys())
for table_number in page.keys():
table = page[table_number]
continuous_tables.append((page_number, table_number, table['ncols']))
scores.append(table['score'])
avg_score = np.mean(scores)
ct_pages = []
header_string = ""
if len(continuous_tables) > 1:
tables = sorted(continuous_tables, key=lambda x: (int(x[0][5:]), int(x[1][6:])))
for k, g in groupby(tables, key=itemgetter(2)):
g = list(g)
tables_same_ncols = set([int(t[0][5:]) for t in g])
tables_same_ncols = sorted(list(tables_same_ncols))
for K, G in groupby(enumerate(tables_same_ncols), key=lambda (i, x): i - x):
G = list(G)
ct_pages.append((str(G[0][1]), str(G[-1][1])))
result_headers = []
for ct in ct_pages:
header_idx = {}
possible_headers = []
ncols = 0
for page_number in range(int(ct[0]), int(ct[1]) + 1):
page = data['page-{0}'.format(page_number)]
for table_number in page.keys():
table = page[table_number]
ncols = table['ncols']
for i, row in enumerate(table['data']):
try:
header_idx[tuple(row)].append(i)
except KeyError:
header_idx[tuple(row)] = [i]
possible_headers = sorted(header_idx, key=lambda k: len(header_idx[k]), reverse=True)[:10]
possible_headers = filter(lambda z: len(z) == ncols,
[filter(lambda x: x != '', p_h) for p_h in possible_headers])
modes = []
for p_h in possible_headers:
try:
modes.append((p_h, max(set(header_idx[p_h]), key=header_idx[p_h].count)))
except KeyError:
pass
header = modes[modes.index(min(modes, key=lambda x: x[1]))][0]
result_headers.append(header)
header_string = "Multi-page table headers*:\n"
header_string = ''.join([header_string, '\n'.join(['pages {0} -> {1}{2}{3}'.format(
'-'.join([cr[0][0], cr[0][1]]), '"', '","'.join(cr[1]), '"') for cr in zip(
ct_pages, result_headers)])])
avg_time = "Time taken per page: {0:.2f} seconds\n".format(
p_time / float(len(data))) if len(data) != 1 else ""
equal_ncols = "\nMulti-page tables on*: {0}\n".format(
', '.join(['-'.join(ct) for ct in ct_pages])) if len(data) != 1 else ""
stats = [len(data), p_time, avg_time, total_tables, avg_score, equal_ncols]
stat_string = ("Pages processed: {0}\nTime taken: {1:.2f} seconds\n"
"{2}Tables found: {3}\nAverage score: {4:.2f}{5}".format(*stats))
print(''.join([stat_string, header_string]))
def convert_to_html(table):
html = ''
html = ''.join([html, '<table border="1">\n'])
@@ -99,23 +256,23 @@ def write_to_disk(data, f='csv', output=None, filename=None):
if f in ['csv', 'tsv']:
import csv
delimiter = ',' if f == 'csv' else '\t'
for page in sorted(data):
for table in range(len(data[page])):
dsvname = '{0}_table_{1}.{2}'.format(page, table + 1, f)
for page_number in sorted(data.keys()):
for table_number in sorted(data[page_number].keys()):
dsvname = '{0}.{1}'.format(''.join([page_number, '_', table_number]), f)
with open(os.path.join(output, dsvname), 'w') as outfile:
writer = csv.writer(
outfile, delimiter=delimiter, quoting=csv.QUOTE_ALL)
for row in data[page][table]:
for row in data[page_number][table_number]['data']:
writer.writerow(row)
elif f == 'html':
htmlname = '{}.html'.format(froot)
for page in sorted(data):
for table in range(len(data[page])):
htmlname = '{0}.html'.format(froot)
for page_number in sorted(data.keys()):
for table_number in sorted(data[page_number].keys()):
with open(os.path.join(output, htmlname), 'a') as htmlfile:
htmlfile.write(convert_to_html(data[page][table]))
htmlfile.write(convert_to_html(data[page_number][table_number]['data']))
elif f == 'json':
import json
with open(os.path.join(output, '{}.json'.format(froot)), 'w') \
with open(os.path.join(output, '{0}.json'.format(froot)), 'w') \
as jsonfile:
json.dump(data, jsonfile)
elif f == 'xlsx':
@@ -123,12 +280,12 @@ def write_to_disk(data, f='csv', output=None, filename=None):
from pyexcel_xlsx import save_data
from collections import OrderedDict
xlsx_data = OrderedDict()
for page in sorted(data):
for table in range(len(data[page])):
sheet_name = '{0}_table_{1}'.format(page, table + 1)
for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
for table_number in sorted(data[page_number].keys(), key=lambda x: int(x[6:])):
sheet_name = ''.join([page_number, '_', table_number])
xlsx_data.update({sheet_name:
[row for row in data[page][table]]})
save_data(os.path.join(output, '{}.xlsx'.format(froot)), xlsx_data)
[row for row in data[page_number][table_number]['data']]})
save_data(os.path.join(output, '{0}.xlsx'.format(froot)), xlsx_data)
except ImportError:
print("link to install docs")
@@ -147,16 +304,17 @@ if __name__ == '__main__':
filename = args['<file>']
filedir = os.path.dirname(args['<file>'])
logname, __ = os.path.splitext(filename)
logname += '.log'
logname = ''.join([logname, '.log'])
scorename, __ = os.path.splitext(filename)
scorename = ''.join([scorename, '_info.csv'])
pngname, __ = os.path.splitext(filename)
if args['--log']:
FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
if args['--output']:
logname = os.path.join(args['--output'], os.path.basename(logname))
logging.basicConfig(
filename=logname, filemode='w', level=logging.DEBUG)
else:
logging.basicConfig(
filename=logname, filemode='w', level=logging.DEBUG)
logging.basicConfig(
filename=logname, filemode='w', format=FORMAT, level=logging.DEBUG)
p = []
if args['--pages'] == '1':
@@ -173,47 +331,142 @@ if __name__ == '__main__':
else:
p.append({'start': int(r), 'end': int(r)})
margin_tuple = (float(args['--cmargin']), float(args['--lmargin']),
float(args['--wmargin']))
if args['<method>'] == 'lattice':
try:
extractor = Lattice(Pdf(filename, pagenos=p, clean=True),
fill=args['--fill'],
scale=int(args['--scale']),
jtol=int(args['--jtol']),
mtol=int(args['--mtol']),
invert=args['--invert'],
debug=args['--debug'],
verbose=args['--verbose'])
data = extractor.get_tables()
manager = Pdf(Lattice(
fill=args['--fill'],
scale=int(args['--scale']),
invert=args['--invert'],
jtol=int(args['--jtol']),
mtol=int(args['--mtol']),
pdf_margin=margin_tuple,
debug=args['--debug']),
filename,
pagenos=p,
parallel=args['--parallel'],
clean=True)
data = manager.extract()
processing_time = time.time() - start_time
vprint("Finished processing in", processing_time, "seconds")
logging.info("Finished processing in " + str(processing_time) + " seconds")
if args['--plot']:
if args['--output']:
pngname = os.path.join(args['--output'], os.path.basename(pngname))
plot_type = args['--plot'].split(',')
if 'page' in plot_type:
for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
page = data[page_number]
for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
table = page[table_number]
plot_table_barchart(table['r_nempty_cells'],
table['c_nempty_cells'],
table['empty_p'],
page_number,
table_number)
if 'all' in plot_type:
plot_all_barchart(data, pngname)
if 'rc' in plot_type:
plot_rc_piechart(data, pngname)
if args['--summary']:
summary(data, processing_time)
if args['--save-info']:
if args['--output']:
scorename = os.path.join(args['--output'], os.path.basename(scorename))
with open(scorename, 'w') as score_file:
score_file.write('table,nrows,ncols,empty_p,line_p,text_p,score\n')
for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
page = data[page_number]
for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
table = page[table_number]
score_file.write('{0},{1},{2},{3},{4},{5},{6}\n'.format(
''.join([page_number, '_', table_number]),
table['nrows'],
table['ncols'],
table['empty_p'],
table['line_p'],
table['text_p'],
table['score']))
if args['--debug']:
extractor.plot_geometry(args['--debug'])
manager.debug_plot()
except Exception as e:
logging.exception(e.message, exc_info=True)
sys.exit()
elif args['<method>'] == 'stream':
try:
extractor = Stream(Pdf(filename, pagenos=p,
char_margin=float(args['--cmargin']),
line_margin=float(args['--lmargin']),
word_margin=float(args['--wmargin']),
clean=True),
ncolumns=int(args['--ncols']),
columns=args['--columns'],
ytol=int(args['--ytol']),
debug=args['--debug'],
verbose=args['--verbose'])
data = extractor.get_tables()
manager = Pdf(Stream(
ncolumns=int(args['--ncols']),
columns=args['--columns'],
ytol=int(args['--ytol']),
mtol=int(args['--mtol']),
pdf_margin=margin_tuple,
debug=args['--debug']),
filename,
pagenos=p,
parallel=args['--parallel'],
clean=True)
data = manager.extract()
processing_time = time.time() - start_time
vprint("Finished processing in", processing_time, "seconds")
logging.info("Finished processing in " + str(processing_time) + " seconds")
if args['--plot']:
if args['--output']:
pngname = os.path.join(args['--output'], os.path.basename(pngname))
plot_type = args['--plot'].split(',')
if 'page' in plot_type:
for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
page = data[page_number]
for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
table = page[table_number]
plot_table_barchart(table['r_nempty_cells'],
table['c_nempty_cells'],
table['empty_p'],
page_number,
table_number)
if 'all' in plot_type:
plot_all_barchart(data, pngname)
if 'rc' in plot_type:
plot_rc_piechart(data, pngname)
if args['--summary']:
summary(data, processing_time)
if args['--save-info']:
if args['--output']:
scorename = os.path.join(args['--output'], os.path.basename(scorename))
with open(scorename, 'w') as score_file:
score_file.write('table,nrows,ncols,empty_p,score\n')
for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
page = data[page_number]
for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
table = page[table_number]
score_file.write('{0},{1},{2},{3},{4}\n'.format(
''.join([page_number, '_', table_number]),
table['nrows'],
table['ncols'],
table['empty_p'],
table['score']))
if args['--debug']:
extractor.plot_text()
manager.debug_plot()
except Exception as e:
logging.exception(e.message, exc_info=True)
sys.exit()
if data is None:
if args['--debug']:
print("See 'camelot <method> -h' for various parameters you can tweak.")
else:
output = filedir if args['--output'] is None else args['--output']
write_to_disk(data, f=args['--format'],
output=output, filename=filename)
vprint("finished in", time.time() - start_time, "seconds")
logging.info("Time taken: " + str(time.time() - start_time) + " seconds")