Add various metrics to score the quality of a parse
pull/2/head
Vinayak Mehta 2016-08-30 14:52:49 +05:30 committed by GitHub
parent 43a009dab4
commit 552f9cf422
11 changed files with 1027 additions and 472 deletions


@@ -1,18 +1,31 @@
-from __future__ import print_function
+from __future__ import division
 import os
+import types
+import copy_reg
+import logging
 import cv2
 import numpy as np
+from wand.image import Image
 from .table import Table
 from .utils import (transform, elements_bbox, detect_vertical, merge_close_values,
-                    get_row_index, get_column_index, reduce_index, outline,
-                    fill_spanning, remove_empty, encode_list)
+                    get_row_index, get_column_index, get_score, reduce_index,
+                    outline, fill_spanning, count_empty, encode_list, pdf_to_text)

 __all__ = ['Lattice']


+def _reduce_method(m):
+    if m.im_self is None:
+        return getattr, (m.im_class, m.im_func.func_name)
+    else:
+        return getattr, (m.im_self, m.im_func.func_name)
+copy_reg.pickle(types.MethodType, _reduce_method)


 def _morph_transform(imagename, scale=15, invert=False):
     """Morphological Transformation
@@ -65,8 +78,8 @@ def _morph_transform(imagename, scale=15, invert=False):
     vertical = threshold
     horizontal = threshold
-    verticalsize = vertical.shape[0] / scale
-    horizontalsize = horizontal.shape[1] / scale
+    verticalsize = vertical.shape[0] // scale
+    horizontalsize = horizontal.shape[1] // scale
     ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize))
     hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1))
@@ -79,8 +92,12 @@ def _morph_transform(imagename, scale=15, invert=False):
     mask = vertical + horizontal
     joints = np.bitwise_and(vertical, horizontal)
-    __, contours, __ = cv2.findContours(
-        mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    try:
+        __, contours, __ = cv2.findContours(
+            mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    except ValueError:
+        contours, __ = cv2.findContours(
+            mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
     contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]

     tables = {}
@@ -88,8 +105,12 @@ def _morph_transform(imagename, scale=15, invert=False):
         c_poly = cv2.approxPolyDP(c, 3, True)
         x, y, w, h = cv2.boundingRect(c_poly)
         roi = joints[y : y + h, x : x + w]
-        __, jc, __ = cv2.findContours(
-            roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
+        try:
+            __, jc, __ = cv2.findContours(
+                roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
+        except ValueError:
+            jc, __ = cv2.findContours(
+                roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
         if len(jc) <= 4:  # remove contours with less than <=4 joints
             continue
         joint_coords = []
@@ -100,16 +121,24 @@ def _morph_transform(imagename, scale=15, invert=False):
         tables[(x, y + h, x + w, y)] = joint_coords

     v_segments, h_segments = [], []
-    _, vcontours, _ = cv2.findContours(
-        vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    try:
+        _, vcontours, _ = cv2.findContours(
+            vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    except ValueError:
+        vcontours, _ = cv2.findContours(
+            vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
     for vc in vcontours:
         x, y, w, h = cv2.boundingRect(vc)
         x1, x2 = x, x + w
         y1, y2 = y, y + h
         v_segments.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1))

-    _, hcontours, _ = cv2.findContours(
-        horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    try:
+        _, hcontours, _ = cv2.findContours(
+            horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    except ValueError:
+        hcontours, _ = cv2.findContours(
+            horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
     for hc in hcontours:
         x, y, w, h = cv2.boundingRect(hc)
         x1, x2 = x, x + w
@@ -160,24 +189,19 @@ class Lattice:
         page as value.
     """

-    def __init__(self, pdfobject, fill=None, scale=15, jtol=2, mtol=2,
-                 invert=False, debug=None, verbose=False):
-        self.pdfobject = pdfobject
+    def __init__(self, fill=None, scale=15, jtol=2, mtol=2,
+                 invert=False, pdf_margin=(2.0, 0.5, 0.1), debug=None):
+        self.method = 'lattice'
         self.fill = fill
         self.scale = scale
         self.jtol = jtol
         self.mtol = mtol
         self.invert = invert
+        self.char_margin, self.line_margin, self.word_margin = pdf_margin
         self.debug = debug
-        self.verbose = verbose
-        self.tables = {}
-        if self.debug is not None:
-            self.debug_images = {}
-            self.debug_segments = {}
-            self.debug_tables = {}

-    def get_tables(self):
+    def get_tables(self, pdfname):
         """Returns all tables found in given pdf.

         Returns
@@ -186,169 +210,124 @@ class Lattice:
         Dictionary with page number as key and list of tables on that
         page as value.
         """
-        vprint = print if self.verbose else lambda *a, **k: None
-        self.pdfobject.split()
-        self.pdfobject.convert()
-        for page in self.pdfobject.extract():
-            p, text, __, width, height = page
-            pkey = 'pg-{0}'.format(p)
-            imagename = os.path.join(
-                self.pdfobject.temp, '{}.png'.format(pkey))
-            pdf_x = width
-            pdf_y = height
-            img, table_bbox, v_segments, h_segments = _morph_transform(
-                imagename, scale=self.scale, invert=self.invert)
-            img_x = img.shape[1]
-            img_y = img.shape[0]
-            scaling_factor_x = pdf_x / float(img_x)
-            scaling_factor_y = pdf_y / float(img_y)
-
-            if self.debug is not None:
-                self.debug_images[pkey] = (img, table_bbox)
-
-            factors = (scaling_factor_x, scaling_factor_y, img_y)
-            table_bbox, v_segments, h_segments = transform(table_bbox, v_segments,
-                                                           h_segments, factors)
-
-            if self.debug is not None:
-                self.debug_segments[pkey] = (v_segments, h_segments)
-
-            if self.debug is not None:
-                debug_page_tables = []
-            page_tables = []
-            # sort tables based on y-coord
-            for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
-                # select edges which lie within table_bbox
-                text_bbox, v_s, h_s = elements_bbox(k, text, v_segments,
-                                                    h_segments)
-                rotated = detect_vertical(text_bbox)
-                cols, rows = zip(*table_bbox[k])
-                cols, rows = list(cols), list(rows)
-                cols.extend([k[0], k[2]])
-                rows.extend([k[1], k[3]])
-                # sort horizontal and vertical segments
-                cols = merge_close_values(sorted(cols), mtol=self.mtol)
-                rows = merge_close_values(
-                    sorted(rows, reverse=True), mtol=self.mtol)
-                # make grid using x and y coord of shortlisted rows and cols
-                cols = [(cols[i], cols[i + 1])
-                        for i in range(0, len(cols) - 1)]
-                rows = [(rows[i], rows[i + 1])
-                        for i in range(0, len(rows) - 1)]
-                table = Table(cols, rows)
-                # set table edges to True using ver+hor lines
-                table = table.set_edges(v_s, h_s, jtol=self.jtol)
-                # set spanning cells to True
-                table = table.set_spanning()
-                # set table border edges to True
-                table = outline(table)
-
-                if self.debug is not None:
-                    debug_page_tables.append(table)
-
-                # fill text after sorting it
-                if rotated == '':
-                    text_bbox.sort(key=lambda x: (-x.y0, x.x0))
-                elif rotated == 'left':
-                    text_bbox.sort(key=lambda x: (x.x0, x.y0))
-                elif rotated == 'right':
-                    text_bbox.sort(key=lambda x: (-x.x0, -x.y0))
-                for t in text_bbox:
-                    r_idx = get_row_index(t, rows)
-                    c_idx = get_column_index(t, cols)
-                    if None in [r_idx, c_idx]:
-                        # couldn't assign LTChar to any cell
-                        pass
-                    else:
-                        r_idx, c_idx = reduce_index(
-                            table, rotated, r_idx, c_idx)
-                        table.cells[r_idx][c_idx].add_text(
-                            t.get_text().strip('\n'))
-                if self.fill is not None:
-                    table = fill_spanning(table, fill=self.fill)
-                ar = table.get_list()
-                if rotated == 'left':
-                    ar = zip(*ar[::-1])
-                elif rotated == 'right':
-                    ar = zip(*ar[::1])
-                    ar.reverse()
-                ar = remove_empty(ar)
-                ar = [list(o) for o in ar]
-                page_tables.append(encode_list(ar))
-            vprint(pkey)
-            self.tables[pkey] = page_tables
-
-            if self.debug is not None:
-                self.debug_tables[pkey] = debug_page_tables
-
-        if self.pdfobject.clean:
-            self.pdfobject.remove_tempdir()
-
-        if self.debug is not None:
-            return None
-
-        return self.tables
-
-    def plot_geometry(self, geometry):
-        """Plots various pdf geometries that are detected so user can
-        tweak scale, jtol, mtol parameters.
-        """
-        import matplotlib.pyplot as plt
-        if geometry == 'contour':
-            for pkey in self.debug_images.keys():
-                img, table_bbox = self.debug_images[pkey]
-                for t in table_bbox.keys():
-                    cv2.rectangle(img, (t[0], t[1]),
-                                  (t[2], t[3]), (255, 0, 0), 3)
-                plt.imshow(img)
-                plt.show()
-        elif geometry == 'joint':
-            x_coord = []
-            y_coord = []
-            for pkey in self.debug_images.keys():
-                img, table_bbox = self.debug_images[pkey]
-                for k in table_bbox.keys():
-                    for coord in table_bbox[k]:
-                        x_coord.append(coord[0])
-                        y_coord.append(coord[1])
-                max_x, max_y = max(x_coord), max(y_coord)
-                plt.plot(x_coord, y_coord, 'ro')
-                plt.axis([0, max_x + 100, max_y + 100, 0])
-                plt.imshow(img)
-                plt.show()
-        elif geometry == 'line':
-            for pkey in self.debug_segments.keys():
-                v_s, h_s = self.debug_segments[pkey]
-                for v in v_s:
-                    plt.plot([v[0], v[2]], [v[1], v[3]])
-                for h in h_s:
-                    plt.plot([h[0], h[2]], [h[1], h[3]])
-                plt.show()
-        elif geometry == 'table':
-            for pkey in self.debug_tables.keys():
-                for table in self.debug_tables[pkey]:
-                    for i in range(len(table.cells)):
-                        for j in range(len(table.cells[i])):
-                            if table.cells[i][j].left:
-                                plt.plot([table.cells[i][j].lb[0],
-                                          table.cells[i][j].lt[0]],
-                                         [table.cells[i][j].lb[1],
-                                          table.cells[i][j].lt[1]])
-                            if table.cells[i][j].right:
-                                plt.plot([table.cells[i][j].rb[0],
-                                          table.cells[i][j].rt[0]],
-                                         [table.cells[i][j].rb[1],
-                                          table.cells[i][j].rt[1]])
-                            if table.cells[i][j].top:
-                                plt.plot([table.cells[i][j].lt[0],
-                                          table.cells[i][j].rt[0]],
-                                         [table.cells[i][j].lt[1],
-                                          table.cells[i][j].rt[1]])
-                            if table.cells[i][j].bottom:
-                                plt.plot([table.cells[i][j].lb[0],
-                                          table.cells[i][j].rb[0]],
-                                         [table.cells[i][j].lb[1],
-                                          table.cells[i][j].rb[1]])
-            plt.show()
+        text, __, width, height = pdf_to_text(pdfname, self.char_margin,
+                                              self.line_margin, self.word_margin)
+        bname, __ = os.path.splitext(pdfname)
+        if not text:
+            logging.warning("{0}: PDF has no text. It may be an image.".format(
+                os.path.basename(bname)))
+            return None
+        imagename = ''.join([bname, '.png'])
+        with Image(filename=pdfname, depth=8, resolution=300) as png:
+            png.save(filename=imagename)
+        pdf_x = width
+        pdf_y = height
+        img, table_bbox, v_segments, h_segments = _morph_transform(
+            imagename, scale=self.scale, invert=self.invert)
+        img_x = img.shape[1]
+        img_y = img.shape[0]
+        scaling_factor_x = pdf_x / float(img_x)
+        scaling_factor_y = pdf_y / float(img_y)
+
+        if self.debug:
+            self.debug_images = (img, table_bbox)
+
+        factors = (scaling_factor_x, scaling_factor_y, img_y)
+        table_bbox, v_segments, h_segments = transform(table_bbox, v_segments,
+                                                       h_segments, factors)
+
+        if self.debug:
+            self.debug_segments = (v_segments, h_segments)
+            self.debug_tables = []
+
+        pdf_page = {}
+        page_tables = {}
+        table_no = 1
+        # sort tables based on y-coord
+        for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
+            # select edges which lie within table_bbox
+            table_info = {}
+            text_bbox, v_s, h_s = elements_bbox(k, text, v_segments,
+                                                h_segments)
+            table_info['text_p'] = 100 * (1 - (len(text_bbox) / len(text)))
+            rotated = detect_vertical(text_bbox)
+            cols, rows = zip(*table_bbox[k])
+            cols, rows = list(cols), list(rows)
+            cols.extend([k[0], k[2]])
+            rows.extend([k[1], k[3]])
+            # sort horizontal and vertical segments
+            cols = merge_close_values(sorted(cols), mtol=self.mtol)
+            rows = merge_close_values(
+                sorted(rows, reverse=True), mtol=self.mtol)
+            # make grid using x and y coord of shortlisted rows and cols
+            cols = [(cols[i], cols[i + 1])
+                    for i in range(0, len(cols) - 1)]
+            rows = [(rows[i], rows[i + 1])
+                    for i in range(0, len(rows) - 1)]
+            table = Table(cols, rows)
+            # set table edges to True using ver+hor lines
+            table = table.set_edges(v_s, h_s, jtol=self.jtol)
+            nouse = table.nocont_ / (len(v_s) + len(h_s))
+            table_info['line_p'] = 100 * (1 - nouse)
+            # set spanning cells to True
+            table = table.set_spanning()
+            # set table border edges to True
+            table = outline(table)
+
+            if self.debug:
+                self.debug_tables.append(table)
+
+            # fill text after sorting it
+            if rotated == '':
+                text_bbox.sort(key=lambda x: (-x.y0, x.x0))
+            elif rotated == 'left':
+                text_bbox.sort(key=lambda x: (x.x0, x.y0))
+            elif rotated == 'right':
+                text_bbox.sort(key=lambda x: (-x.x0, -x.y0))
+
+            rerror = []
+            cerror = []
+            for t in text_bbox:
+                try:
+                    r_idx, rass_error = get_row_index(t, rows)
+                except TypeError:
+                    # couldn't assign LTChar to any cell
+                    continue
+                try:
+                    c_idx, cass_error = get_column_index(t, cols)
+                except TypeError:
+                    # couldn't assign LTChar to any cell
+                    continue
+                rerror.append(rass_error)
+                cerror.append(cass_error)
+                r_idx, c_idx = reduce_index(
+                    table, rotated, r_idx, c_idx)
+                table.cells[r_idx][c_idx].add_text(
+                    t.get_text().strip('\n'))
+            score = get_score([[50, rerror], [50, cerror]])
+            table_info['score'] = score
+
+            if self.fill is not None:
+                table = fill_spanning(table, fill=self.fill)
+            ar = table.get_list()
+            if rotated == 'left':
+                ar = zip(*ar[::-1])
+            elif rotated == 'right':
+                ar = zip(*ar[::1])
+                ar.reverse()
+            ar = encode_list(ar)
+            table_info['data'] = ar
+            empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
+            table_info['empty_p'] = empty_p
+            table_info['r_nempty_cells'] = r_nempty_cells
+            table_info['c_nempty_cells'] = c_nempty_cells
+            table_info['nrows'] = len(ar)
+            table_info['ncols'] = len(ar[0])
+            page_tables['table_{0}'.format(table_no)] = table_info
+            table_no += 1
+        pdf_page[os.path.basename(bname)] = page_tables

+        if self.debug:
+            return None
+
+        return pdf_page
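
A note on the `_reduce_method`/`copy_reg` block this commit adds at the top of the module: Python 2's pickle cannot serialize bound methods, and `Pdf.extract()` (below) hands `self.extractor.get_tables` to `multiprocessing.Pool.map`, which pickles its callable. Registering a reducer for `types.MethodType` works around that. A minimal sketch of the same idea in Python 3 terms, where the module is named `copyreg` and the old `im_self`/`im_func` attributes are `__self__`/`__func__` (Python 3 pickles bound methods natively, so this is purely illustrative):

    import types
    import copyreg  # named copy_reg in the Python 2 code above

    def _reduce_method(m):
        # on unpickling, rebuild the bound method via getattr(instance, name)
        return getattr, (m.__self__, m.__func__.__name__)

    copyreg.pickle(types.MethodType, _reduce_method)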


@@ -1,18 +1,11 @@
 import os
 import shutil
 import tempfile
+import itertools
+import multiprocessing as mp
+import cv2
 from PyPDF2 import PdfFileReader, PdfFileWriter
-from pdfminer.pdfparser import PDFParser
-from pdfminer.pdfdocument import PDFDocument
-from pdfminer.pdfpage import PDFPage
-from pdfminer.pdfpage import PDFTextExtractionNotAllowed
-from pdfminer.pdfinterp import PDFResourceManager
-from pdfminer.pdfinterp import PDFPageInterpreter
-from pdfminer.pdfdevice import PDFDevice
-from pdfminer.converter import PDFPageAggregator
-from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal
-from wand.image import Image

 __all__ = ['Pdf']
@@ -38,38 +31,6 @@ def _parse_page_numbers(pagenos):
     return page_numbers


-def _extract_text_objects(layout, LTObject, t=None):
-    """Recursively parses pdf layout to get a list of
-    text objects.
-
-    Parameters
-    ----------
-    layout : object
-        Layout object.
-
-    LTObject : object
-        Text object, either LTChar or LTTextLineHorizontal.
-
-    t : list (optional, default: None)
-
-    Returns
-    -------
-    t : list
-        List of text objects.
-    """
-    if t is None:
-        t = []
-    try:
-        for obj in layout._objs:
-            if isinstance(obj, LTObject):
-                t.append(obj)
-            else:
-                t += _extract_text_objects(obj, LTObject)
-    except AttributeError:
-        pass
-    return t
-
-
 class Pdf:
     """Handles all pdf operations which include:
@@ -99,66 +60,163 @@ class Pdf:
         is greater than word_margin. (optional, default: 0.1)
     """

-    def __init__(self, pdfname, pagenos=[{'start': 1, 'end': 1}],
-                 char_margin=2.0, line_margin=0.5, word_margin=0.1,
-                 clean=False):
+    def __init__(self, extractor, pdfname, pagenos=[{'start': 1, 'end': 1}],
+                 parallel=False, clean=False):
+        self.extractor = extractor
         self.pdfname = pdfname
-        if not self.pdfname.endswith('.pdf'):
-            raise TypeError("Only PDF format is supported right now.")
         self.pagenos = _parse_page_numbers(pagenos)
-        self.char_margin = char_margin
-        self.line_margin = line_margin
-        self.word_margin = word_margin
+        self.parallel = parallel
+        self.cpu_count = mp.cpu_count()
+        self.pool = mp.Pool(processes=self.cpu_count)
         self.clean = clean
         self.temp = tempfile.mkdtemp()

     def split(self):
         """Splits pdf into single page pdfs.
         """
+        if not self.pdfname.endswith('.pdf'):
+            raise TypeError("Only PDF format is supported.")
         infile = PdfFileReader(open(self.pdfname, 'rb'), strict=False)
         for p in self.pagenos:
             page = infile.getPage(p - 1)
             outfile = PdfFileWriter()
             outfile.addPage(page)
-            with open(os.path.join(self.temp, 'pg-{0}.pdf'.format(p)), 'wb') as f:
+            with open(os.path.join(self.temp, 'page-{0}.pdf'.format(p)), 'wb') as f:
                 outfile.write(f)

+    def remove_tempdir(self):
+        shutil.rmtree(self.temp)
+
     def extract(self):
         """Extracts text objects, width, height from a pdf.
         """
-        for p in self.pagenos:
-            pkey = 'pg-{0}'.format(p)
-            pname = os.path.join(self.temp, '{}.pdf'.format(pkey))
-            with open(pname, 'r') as f:
-                parser = PDFParser(f)
-                document = PDFDocument(parser)
-                if not document.is_extractable:
-                    raise PDFTextExtractionNotAllowed
-                laparams = LAParams(char_margin=self.char_margin,
-                                    line_margin=self.line_margin,
-                                    word_margin=self.word_margin)
-                rsrcmgr = PDFResourceManager()
-                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
-                interpreter = PDFPageInterpreter(rsrcmgr, device)
-                for page in PDFPage.create_pages(document):
-                    interpreter.process_page(page)
-                    layout = device.get_result()
-                    lattice_objects = _extract_text_objects(layout, LTChar)
-                    stream_objects = _extract_text_objects(
-                        layout, LTTextLineHorizontal)
-                    width = layout.bbox[2]
-                    height = layout.bbox[3]
-                    yield p, lattice_objects, stream_objects, width, height
+        self.split()
+        pages = [os.path.join(self.temp, 'page-{0}.pdf'.format(p))
+                 for p in self.pagenos]
+        if self.parallel:
+            tables = self.pool.map(self.extractor.get_tables, pages)
+            tables = {k: v for d in tables if d is not None for k, v in d.items()}
+        else:
+            tables = {}
+            if self.extractor.debug:
+                if self.extractor.method == 'stream':
+                    self.debug = self.extractor.debug
+                    self.debug_text = []
+                elif self.extractor.method == 'lattice':
+                    self.debug = self.extractor.debug
+                    self.debug_images = []
+                    self.debug_segments = []
+                    self.debug_tables = []
+            for p in pages:
+                table = self.extractor.get_tables(p)
+                if table is not None:
+                    tables.update(table)
+                if self.extractor.debug:
+                    if self.extractor.method == 'stream':
+                        self.debug_text.append(self.extractor.debug_text)
+                    elif self.extractor.method == 'lattice':
+                        self.debug_images.append(self.extractor.debug_images)
+                        self.debug_segments.append(self.extractor.debug_segments)
+                        self.debug_tables.append(self.extractor.debug_tables)
+        if self.clean:
+            self.remove_tempdir()
+        return tables

-    def convert(self):
-        """Converts single page pdfs to images.
+    def debug_plot(self):
+        """Plots all text objects and various pdf geometries so that
+        user can choose number of columns, columns x-coordinates for
+        Stream or tweak Lattice parameters (scale, jtol, mtol).
         """
-        for p in self.pagenos:
-            pdfname = os.path.join(self.temp, 'pg-{0}.pdf'.format(p))
-            imagename = os.path.join(self.temp, 'pg-{0}.png'.format(p))
-            with Image(filename=pdfname, depth=8, resolution=300) as png:
-                png.save(filename=imagename)
-
-    def remove_tempdir(self):
-        shutil.rmtree(self.temp)
+        import matplotlib.pyplot as plt
+        import matplotlib.patches as patches
+
+        if self.debug is True:
+            try:
+                for text in self.debug_text:
+                    fig = plt.figure()
+                    ax = fig.add_subplot(111, aspect='equal')
+                    xs, ys = [], []
+                    for t in text:
+                        xs.extend([t[0], t[1]])
+                        ys.extend([t[2], t[3]])
+                        ax.add_patch(
+                            patches.Rectangle(
+                                (t[0], t[1]),
+                                t[2] - t[0],
+                                t[3] - t[1]
+                            )
+                        )
+                    ax.set_xlim(min(xs) - 10, max(xs) + 10)
+                    ax.set_ylim(min(ys) - 10, max(ys) + 10)
+                    plt.show()
+            except AttributeError:
+                raise ValueError("This option only be used with Stream.")
+        elif self.debug == 'contour':
+            try:
+                for img, table_bbox in self.debug_images:
+                    for t in table_bbox.keys():
+                        cv2.rectangle(img, (t[0], t[1]),
+                                      (t[2], t[3]), (255, 0, 0), 3)
+                    plt.imshow(img)
+                    plt.show()
+            except AttributeError:
+                raise ValueError("This option only be used with Lattice.")
+        elif self.debug == 'joint':
+            try:
+                for img, table_bbox in self.debug_images:
+                    x_coord = []
+                    y_coord = []
+                    for k in table_bbox.keys():
+                        for coord in table_bbox[k]:
+                            x_coord.append(coord[0])
+                            y_coord.append(coord[1])
+                    max_x, max_y = max(x_coord), max(y_coord)
+                    plt.plot(x_coord, y_coord, 'ro')
+                    plt.axis([0, max_x + 100, max_y + 100, 0])
+                    plt.imshow(img)
+                    plt.show()
+            except AttributeError:
+                raise ValueError("This option only be used with Lattice.")
+        elif self.debug == 'line':
+            try:
+                for v_s, h_s in self.debug_segments:
+                    for v in v_s:
+                        plt.plot([v[0], v[2]], [v[1], v[3]])
+                    for h in h_s:
+                        plt.plot([h[0], h[2]], [h[1], h[3]])
+                    plt.show()
+            except AttributeError:
+                raise ValueError("This option only be used with Lattice.")
+        elif self.debug == 'table':
+            try:
+                for tables in self.debug_tables:
+                    for table in tables:
+                        for i in range(len(table.cells)):
+                            for j in range(len(table.cells[i])):
+                                if table.cells[i][j].left:
+                                    plt.plot([table.cells[i][j].lb[0],
+                                              table.cells[i][j].lt[0]],
+                                             [table.cells[i][j].lb[1],
+                                              table.cells[i][j].lt[1]])
+                                if table.cells[i][j].right:
+                                    plt.plot([table.cells[i][j].rb[0],
+                                              table.cells[i][j].rt[0]],
+                                             [table.cells[i][j].rb[1],
+                                              table.cells[i][j].rt[1]])
+                                if table.cells[i][j].top:
+                                    plt.plot([table.cells[i][j].lt[0],
+                                              table.cells[i][j].rt[0]],
+                                             [table.cells[i][j].lt[1],
+                                              table.cells[i][j].rt[1]])
+                                if table.cells[i][j].bottom:
+                                    plt.plot([table.cells[i][j].lb[0],
+                                              table.cells[i][j].rb[0]],
+                                             [table.cells[i][j].lb[1],
+                                              table.cells[i][j].rb[1]])
+                    plt.show()
+            except AttributeError:
+                raise ValueError("This option only be used with Lattice.")
+        else:
+            raise UserWarning("This method can only be called after"
+                              " debug has been specified.")


@@ -1,14 +1,26 @@
-from __future__ import print_function
+from __future__ import division
 import os
+import types
+import copy_reg
+import logging
 import numpy as np
-from .utils import get_column_index, encode_list
+from .table import Table
+from .utils import get_row_index, get_score, count_empty, encode_list, pdf_to_text

 __all__ = ['Stream']


+def _reduce_method(m):
+    if m.im_self is None:
+        return getattr, (m.im_class, m.im_func.func_name)
+    else:
+        return getattr, (m.im_self, m.im_func.func_name)
+copy_reg.pickle(types.MethodType, _reduce_method)


 def _group_rows(text, ytol=2):
     """Groups text objects into rows using ytol.
@@ -35,14 +47,16 @@ def _group_rows(text, ytol=2):
         #                 type(obj) is LTChar]):
         if t.get_text().strip():
             if not np.isclose(row_y, t.y0, atol=ytol):
-                row_y = t.y0
-                rows.append(temp)
+                rows.append(sorted(temp, key=lambda t: t.x0))
                 temp = []
+                row_y = t.y0
             temp.append(t)
+    rows.append(sorted(temp, key=lambda t: t.x0))
+    __ = rows.pop(0)  # hacky
     return rows


-def _merge_columns(l):
+def _merge_columns(l, mtol=2):
     """Merges overlapping columns and returns list with updated
     columns boundaries.
@@ -62,7 +76,8 @@ def _merge_columns(l):
             merged.append(higher)
         else:
             lower = merged[-1]
-            if higher[0] <= lower[1]:
+            if (higher[0] <= lower[1] or
+                    np.isclose(higher[0], lower[1], atol=mtol)):
                 upper_bound = max(lower[1], higher[1])
                 lower_bound = min(lower[0], higher[0])
                 merged[-1] = (lower_bound, upper_bound)
@@ -71,6 +86,62 @@
     return merged


+def _get_column_index(t, columns):
+    """Gets index of the column in which the given object falls by
+    comparing their co-ordinates.
+
+    Parameters
+    ----------
+    t : object
+
+    columns : list
+
+    Returns
+    -------
+    c : int
+    """
+    offset1, offset2 = 0, 0
+    lt_col_overlap = []
+    for c in columns:
+        if c[0] <= t.x1 and c[1] >= t.x0:
+            left = t.x0 if c[0] <= t.x0 else c[0]
+            right = t.x1 if c[1] >= t.x1 else c[1]
+            lt_col_overlap.append(abs(left - right) / abs(c[0] - c[1]))
+        else:
+            lt_col_overlap.append(-1)
+    if len(filter(lambda x: x != -1, lt_col_overlap)) == 0:
+        logging.warning("Text doesn't fit any column.")
+    c_idx = lt_col_overlap.index(max(lt_col_overlap))
+    if t.x0 < columns[c_idx][0]:
+        offset1 = abs(t.x0 - columns[c_idx][0])
+    if t.x1 > columns[c_idx][1]:
+        offset2 = abs(t.x1 - columns[c_idx][1])
+    Y = abs(t.y0 - t.y1)
+    charea = abs(t.x0 - t.x1) * abs(t.y0 - t.y1)
+    error = (Y * (offset1 + offset2)) / charea
+    return c_idx, error
+
+
+def _add_columns(cols, text, ytolerance):
+    if text:
+        text = _group_rows(text, ytol=ytolerance)
+        elements = [len(r) for r in text]
+        new_cols = [(t.x0, t.x1)
+                    for r in text if len(r) == max(elements) for t in r]
+        cols.extend(_merge_columns(sorted(new_cols)))
+    return cols
+
+
+def _join_columns(cols, width):
+    cols = sorted(cols)
+    cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
+    cols.insert(0, 0)
+    cols.append(width)  # or some tolerance
+    cols = [(cols[i], cols[i + 1])
+            for i in range(0, len(cols) - 1)]
+    return cols


 class Stream:
     """Stream algorithm
@@ -105,20 +176,18 @@ class Stream:
         page as value.
     """

-    def __init__(self, pdfobject, ncolumns=0, columns=None, ytol=2,
-                 debug=False, verbose=False):
-        self.pdfobject = pdfobject
+    def __init__(self, ncolumns=0, columns=None, ytol=2, mtol=2,
+                 pdf_margin=(2.0, 0.5, 0.1), debug=False):
+        self.method = 'stream'
         self.ncolumns = ncolumns
         self.columns = columns
         self.ytol = ytol
+        self.mtol = mtol
+        self.char_margin, self.line_margin, self.word_margin = pdf_margin
         self.debug = debug
-        self.verbose = verbose
-        self.tables = {}
-        if self.debug:
-            self.debug_text = {}

-    def get_tables(self):
+    def get_tables(self, pdfname):
         """Returns all tables found in given pdf.

         Returns
@@ -127,86 +196,112 @@ class Stream:
         Dictionary with page number as key and list of tables on that
         page as value.
         """
-        vprint = print if self.verbose else lambda *a, **k: None
-        self.pdfobject.split()
-        for page in self.pdfobject.extract():
-            p, __, text, __, __ = page
-            pkey = 'pg-{0}'.format(p)
-            text.sort(key=lambda x: (-x.y0, x.x0))
-
-            if self.debug:
-                self.debug_text[pkey] = text
-
-            rows = _group_rows(text, ytol=self.ytol)
-            elements = [len(r) for r in rows]
-            # a table can't have just 1 column, can it?
-            elements = filter(lambda x: x != 1, elements)
-
-            guess = False
-            if self.columns:
-                cols = self.columns.split(',')
-                cols = [(float(cols[i]), float(cols[i + 1]))
-                        for i in range(0, len(cols) - 1)]
-            else:
-                guess = True
-                ncols = self.ncolumns if self.ncolumns else max(
-                    set(elements), key=elements.count)
-                if ncols == 0:
-                    # no tables detected
-                    continue
-                cols = [(t.x0, t.x1)
-                        for r in rows for t in r if len(r) == ncols]
-                cols = _merge_columns(sorted(cols))
-                cols = [(c[0] + c[1]) / 2.0 for c in cols]
-
-            ar = [['' for c in cols] for r in rows]
-            for r_idx, r in enumerate(rows):
-                for t in r:
-                    if guess:
-                        cog = (t.x0 + t.x1) / 2.0
-                        diff = [abs(cog - c) for c in cols]
-                        c_idx = diff.index(min(diff))
-                    else:
-                        c_idx = get_column_index(t, cols)
-                    if None in [r_idx, c_idx]:  # couldn't assign LTTextLH to any cell
-                        continue
-                    if ar[r_idx][c_idx]:
-                        ar[r_idx][c_idx] = ' '.join(
-                            [ar[r_idx][c_idx], t.get_text().strip()])
-                    else:
-                        ar[r_idx][c_idx] = t.get_text().strip()
-            vprint(pkey)
-            self.tables[pkey] = [encode_list(ar)]
-
-        if self.pdfobject.clean:
-            self.pdfobject.remove_tempdir()
-
-        if self.debug:
-            return None
-
-        return self.tables
-
-    def plot_text(self):
-        """Plots all text objects so user can choose number of columns
-        or columns x-coordinates using the matplotlib interface.
-        """
-        import matplotlib.pyplot as plt
-        import matplotlib.patches as patches
-
-        for pkey in sorted(self.debug_text.keys()):
-            fig = plt.figure()
-            ax = fig.add_subplot(111, aspect='equal')
-            xs, ys = [], []
-            for t in self.debug_text[pkey]:
-                xs.extend([t.x0, t.x1])
-                ys.extend([t.y0, t.y1])
-                ax.add_patch(
-                    patches.Rectangle(
-                        (t.x0, t.y0),
-                        t.x1 - t.x0,
-                        t.y1 - t.y0
-                    )
-                )
-            ax.set_xlim(min(xs) - 10, max(xs) + 10)
-            ax.set_ylim(min(ys) - 10, max(ys) + 10)
-            plt.show()
+        __, text, width, height = pdf_to_text(pdfname, self.char_margin,
+                                              self.line_margin, self.word_margin)
+        bname, __ = os.path.splitext(pdfname)
+        if not text:
+            logging.warning("{0}: PDF has no text. It may be an image.".format(
+                os.path.basename(bname)))
+            return None
+        text.sort(key=lambda x: (-x.y0, x.x0))
+
+        if self.debug:
+            self.debug_text = [(t.x0, t.y0, t.x1, t.y1) for t in text]
+            return None
+
+        rows_grouped = _group_rows(text, ytol=self.ytol)
+        elements = [len(r) for r in rows_grouped]
+        row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r)
+                    if len(r) > 0 else 0 for r in rows_grouped]
+        rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
+        rows.insert(0, height)  # or some tolerance
+        rows.append(0)
+        rows = [(rows[i], rows[i + 1])
+                for i in range(0, len(rows) - 1)]
+
+        guess = False
+        if self.columns:
+            # user has to input boundary columns too
+            # take (0, width) by default
+            # similar to else condition
+            # len can't be 1
+            cols = self.columns.split(',')
+            cols = [(float(cols[i]), float(cols[i + 1]))
+                    for i in range(0, len(cols) - 1)]
+        else:
+            if self.ncolumns:
+                ncols = self.ncolumns
+                cols = [(t.x0, t.x1)
+                        for r in rows_grouped if len(r) == ncols for t in r]
+                cols = _merge_columns(sorted(cols), mtol=self.mtol)
+                if len(cols) != self.ncolumns:
+                    logging.warning("{}: The number of columns after merge"
+                                    " isn't the same as what you specified."
+                                    " Change the value of mtol.".format(
+                                    os.path.basename(bname)))
+                cols = _join_columns(cols, width)
+            else:
+                guess = True
+                ncols = max(set(elements), key=elements.count)
+                len_non_mode = len(filter(lambda x: x != ncols, elements))
+                if ncols == 1 and not self.debug:
+                    # no tables detected
+                    logging.warning("{}: Only one column was detected, the PDF"
+                                    " may have no tables. Specify ncols if"
+                                    " the PDF has tables.".format(
+                                    os.path.basename(bname)))
+                cols = [(t.x0, t.x1)
+                        for r in rows_grouped if len(r) == ncols for t in r]
+                cols = _merge_columns(sorted(cols), mtol=self.mtol)
+                inner_text = []
+                for i in range(1, len(cols)):
+                    left = cols[i - 1][1]
+                    right = cols[i][0]
+                    inner_text.extend([t for t in text if t.x0 > left and t.x1 < right])
+                outer_text = [t for t in text if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
+                inner_text.extend(outer_text)
+                cols = _add_columns(cols, inner_text, self.ytol)
+                cols = _join_columns(cols, width)
+
+        pdf_page = {}
+        page_tables = {}
+        table_info = {}
+        table = Table(cols, rows)
+        rerror = []
+        cerror = []
+        for row in rows_grouped:
+            for t in row:
+                try:
+                    r_idx, rass_error = get_row_index(t, rows)
+                except ValueError as e:
+                    # couldn't assign LTTextLH to any cell
+                    vprint(e.message)
+                    continue
+                try:
+                    c_idx, cass_error = _get_column_index(t, cols)
+                except ValueError as e:
+                    # couldn't assign LTTextLH to any cell
+                    vprint(e.message)
+                    continue
+                rerror.append(rass_error)
+                cerror.append(cass_error)
+                table.cells[r_idx][c_idx].add_text(
+                    t.get_text().strip('\n'))
+        if guess:
+            score = get_score([[33, rerror], [33, cerror], [34, [len_non_mode / len(elements)]]])
+        else:
+            score = get_score([[50, rerror], [50, cerror]])
+        table_info['score'] = score
+        ar = table.get_list()
+        ar = encode_list(ar)
+        table_info['data'] = ar
+        empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
+        table_info['empty_p'] = empty_p
+        table_info['r_nempty_cells'] = r_nempty_cells
+        table_info['c_nempty_cells'] = c_nempty_cells
+        table_info['nrows'] = len(ar)
+        table_info['ncols'] = len(ar[0])
+        page_tables['table_1'] = table_info
+        pdf_page[os.path.basename(bname)] = page_tables
+
+        return pdf_page
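
To make the new `_get_column_index` arithmetic concrete: it picks the column with the largest horizontal overlap, then reports the fraction of the text box that hangs outside that column as the assignment error. A rough rework of that calculation with a plain namespace standing in for a pdfminer text object (all coordinates invented):

    from types import SimpleNamespace

    columns = [(0, 100), (100, 200)]
    t = SimpleNamespace(x0=90, x1=120, y0=0, y1=10)  # straddles the boundary at 100

    overlap = []  # overlap with each column, as a fraction of the column width
    for c in columns:
        if c[0] <= t.x1 and c[1] >= t.x0:
            overlap.append((min(t.x1, c[1]) - max(t.x0, c[0])) / (c[1] - c[0]))
        else:
            overlap.append(-1)
    c_idx = overlap.index(max(overlap))  # 1, since 20/100 beats 10/100
    offset = max(0, columns[c_idx][0] - t.x0) + max(0, t.x1 - columns[c_idx][1])
    height = abs(t.y0 - t.y1)
    charea = abs(t.x0 - t.x1) * height
    error = (height * offset) / charea
    print(c_idx, error)  # 1 0.333... -> a third of the box lies outside column 1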


@@ -26,6 +26,7 @@ class Table:
         self.rows = rows
         self.cells = [[Cell(c[0], r[1], c[1], r[0])
                       for c in cols] for r in rows]
+        self.nocont_ = 0

     def set_edges(self, vertical, horizontal, jtol=2):
         """Sets cell edges to True if corresponding line segments
@@ -53,6 +54,7 @@
             k = [k for k, t in enumerate(self.rows)
                  if np.isclose(v[1], t[0], atol=jtol)]
             if not j:
+                self.nocont_ += 1
                 continue
             J = j[0]
             if i == [0]:  # only left edge
@@ -104,6 +106,7 @@
             k = [k for k, t in enumerate(self.cols)
                  if np.isclose(h[2], t[0], atol=jtol)]
             if not j:
+                self.nocont_ += 1
                 continue
             J = j[0]
             if i == [0]:  # only top edge

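The `nocont_` counter added here feeds the new `line_p` metric in the Lattice changes above: every detected line segment that fails to join any cell edge bumps the counter, and the metric is `100 * (1 - nocont_ / (len(v_s) + len(h_s)))`. Back-of-envelope, with invented numbers:

    nocont = 2        # segments that joined no cell edge (Table.nocont_)
    n_segments = 20   # len(v_s) + len(h_s)
    line_p = 100 * (1 - nocont / n_segments)
    print(line_p)     # 90.0 -> 90% of the detected lines were actually used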

@@ -1,5 +1,18 @@
+from __future__ import division
+import os
 import numpy as np
+from pdfminer.pdfparser import PDFParser
+from pdfminer.pdfdocument import PDFDocument
+from pdfminer.pdfpage import PDFPage
+from pdfminer.pdfpage import PDFTextExtractionNotAllowed
+from pdfminer.pdfinterp import PDFResourceManager
+from pdfminer.pdfinterp import PDFPageInterpreter
+from pdfminer.pdfdevice import PDFDevice
+from pdfminer.converter import PDFPageAggregator
+from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal


 def translate(x1, x2):
     """Translates x2 by x1.
@@ -243,15 +256,24 @@ def get_row_index(t, rows):
     ----------
     t : object

-    rows : list
+    rows : list, sorted in decreasing order

     Returns
    -------
     r : int
     """
+    offset1, offset2 = 0, 0
     for r in range(len(rows)):
         if (t.y0 + t.y1) / 2.0 < rows[r][0] and (t.y0 + t.y1) / 2.0 > rows[r][1]:
-            return r
+            if t.y0 > rows[r][0]:
+                offset1 = abs(t.y0 - rows[r][0])
+            if t.y1 < rows[r][1]:
+                offset2 = abs(t.y1 - rows[r][1])
+            X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1)
+            Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1)
+            charea = X * Y
+            error = (X * (offset1 + offset2)) / charea
+            return r, error


 def get_column_index(t, columns):
@@ -268,9 +290,45 @@ def get_column_index(t, columns):
     -------
     c : int
     """
+    offset1, offset2 = 0, 0
     for c in range(len(columns)):
         if (t.x0 + t.x1) / 2.0 > columns[c][0] and (t.x0 + t.x1) / 2.0 < columns[c][1]:
-            return c
+            if t.x0 < columns[c][0]:
+                offset1 = abs(t.x0 - columns[c][0])
+            if t.x1 > columns[c][1]:
+                offset2 = abs(t.x1 - columns[c][1])
+            X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1)
+            Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1)
+            charea = X * Y
+            error = (Y * (offset1 + offset2)) / charea
+            return c, error
+
+
+def get_score(error_weights):
+    """Calculates score based on weights assigned to various parameters,
+    and their error percentages.
+
+    Parameters
+    ----------
+    error_weights : dict
+        Dict with a tuple of error percentages as key and weightage
+        assigned to them as value. Sum of all values should be equal
+        to 100.
+
+    Returns
+    -------
+    score : float
+    """
+    SCORE_VAL = 100
+    score = 0
+    if sum([ew[0] for ew in error_weights]) != SCORE_VAL:
+        raise ValueError("Please assign a valid weightage to each parameter"
+                         " such that their sum is equal to 100")
+    for ew in error_weights:
+        weight = ew[0] / len(ew[1])
+        for error_percentage in ew[1]:
+            score += weight * (1 - error_percentage)
+    return score


 def reduce_index(t, rotated, r_idx, c_idx):
@@ -394,6 +452,110 @@ def remove_empty(d):
     return d


+def count_empty(d):
+    """Counts empty rows and columns from list of lists.
+
+    Parameters
+    ----------
+    d : list
+
+    Returns
+    -------
+    n_empty_rows : number of empty rows
+    n_empty_cols : number of empty columns
+    empty_p : percentage of empty cells
+    """
+    empty_p = 0
+    r_nempty_cells, c_nempty_cells = [], []
+    for i in d:
+        for j in i:
+            if j.strip() == '':
+                empty_p += 1
+    empty_p = 100 * (empty_p / float(len(d) * len(d[0])))
+    for row in d:
+        r_nempty_c = 0
+        for r in row:
+            if r.strip() != '':
+                r_nempty_c += 1
+        r_nempty_cells.append(r_nempty_c)
+    d = zip(*d)
+    d = [list(col) for col in d]
+    for col in d:
+        c_nempty_c = 0
+        for c in col:
+            if c.strip() != '':
+                c_nempty_c += 1
+        c_nempty_cells.append(c_nempty_c)
+    return empty_p, r_nempty_cells, c_nempty_cells
+
+
 def encode_list(ar):
+    """Encodes list of text.
+
+    Parameters
+    ----------
+    ar : list
+
+    Returns
+    -------
+    ar : list
+    """
     ar = [[r.encode('utf-8') for r in row] for row in ar]
     return ar
+
+
+def extract_text_objects(layout, LTObject, t=None):
+    """Recursively parses pdf layout to get a list of
+    text objects.
+
+    Parameters
+    ----------
+    layout : object
+        Layout object.
+
+    LTObject : object
+        Text object, either LTChar or LTTextLineHorizontal.
+
+    t : list (optional, default: None)
+
+    Returns
+    -------
+    t : list
+        List of text objects.
+    """
+    if t is None:
+        t = []
+    try:
+        for obj in layout._objs:
+            if isinstance(obj, LTObject):
+                t.append(obj)
+            else:
+                t += extract_text_objects(obj, LTObject)
+    except AttributeError:
+        pass
+    return t
+
+
+def pdf_to_text(pname, char_margin, line_margin, word_margin):
+    # pkey = 'page-{0}'.format(p)
+    # pname = os.path.join(self.temp, '{}.pdf'.format(pkey))
+    with open(pname, 'r') as f:
+        parser = PDFParser(f)
+        document = PDFDocument(parser)
+        if not document.is_extractable:
+            raise PDFTextExtractionNotAllowed
+        laparams = LAParams(char_margin=char_margin,
+                            line_margin=line_margin,
+                            word_margin=word_margin)
+        rsrcmgr = PDFResourceManager()
+        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
+        interpreter = PDFPageInterpreter(rsrcmgr, device)
+        for page in PDFPage.create_pages(document):
+            interpreter.process_page(page)
+            layout = device.get_result()
+            lattice_objects = extract_text_objects(layout, LTChar)
+            stream_objects = extract_text_objects(
+                layout, LTTextLineHorizontal)
+            width = layout.bbox[2]
+            height = layout.bbox[3]
+    return lattice_objects, stream_objects, width, height
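
To see how the new `get_score` combines weighted error lists (a perfect parse, all errors zero, scores 100), here is a condensed restatement with invented row- and column-assignment errors; the `[[50, rerror], [50, cerror]]` shape matches how Lattice calls it:

    def get_score(error_weights):
        if sum(ew[0] for ew in error_weights) != 100:
            raise ValueError("weights must sum to 100")
        score = 0
        for weight, errors in error_weights:
            for error in errors:
                score += (weight / len(errors)) * (1 - error)
        return score

    rerror = [0.0, 0.1, 0.2]     # invented row-assignment error fractions
    cerror = [0.05, 0.05, 0.05]  # invented column-assignment error fractions
    print(get_score([[50, rerror], [50, cerror]]))  # 92.5 out of 100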

View File

@@ -39,7 +39,7 @@ Usage

    >>> extractor = Lattice(Pdf('us-030.pdf'))
    >>> tables = extractor.get_tables()
-   >>> print tables['pg-1']
+   >>> print tables['page-1'][0]

 .. csv-table::
     :header: "Cycle Name","KI (1/km)","Distance (mi)","Percent Fuel Savings","","",""


@@ -65,7 +65,7 @@ Finally, the characters found on the page are assigned to cells based on their x

    >>> extractor = Lattice(Pdf('us-030.pdf'))
    >>> tables = extractor.get_tables()
-   >>> print tables['pg-1']
+   >>> print tables['page-1'][0]

 .. csv-table::
     :header: "Cycle Name","KI (1/km)","Distance (mi)","Percent Fuel Savings","","",""

@@ -114,7 +114,7 @@ In the PDF used above, you can see that some cells spanned a lot of rows, `fill`

    >>> extractor = Lattice(Pdf('row_span_1.pdf'), fill='v', scale=40)
    >>> tables = extractor.get_tables()
-   >>> print tables['pg-1']
+   >>> print tables['page-1'][0]

 .. csv-table::
     :header: "Plan Type","County","Plan Name","Totals"

@@ -173,7 +173,7 @@ To find line segments, Lattice needs the lines of the PDF to be in foreground. S

    >>> extractor = Lattice(Pdf('lines_in_background_1.pdf'), invert=True)
    >>> tables = extractor.get_tables()
-   >>> print tables['pg-1']
+   >>> print tables['page-1'][0]

 .. csv-table::
     :header: "State","Date","Halt stations","Halt days","Persons directly reached(in lakh)","Persons trained","Persons counseled","Persons testedfor HIV"


@@ -17,7 +17,7 @@ Let's run it on this PDF.

    >>> extractor = Stream(Pdf('eu-027.pdf'))
    >>> tables = extractor.get_tables()
-   >>> print tables['pg-1']
+   >>> print tables['page-1'][0]

 .. .. _this: insert link for eu-027.pdf

@@ -68,7 +68,7 @@ But sometimes its guess could be incorrect, like in this case.

    >>> extractor = Stream(Pdf('missing_values.pdf'))
    >>> tables = extractor.get_tables()
-   >>> print tables['pg-1']
+   >>> print tables['page-1'][0]

 .. .. _this: insert link for missing_values.pdf

@@ -127,7 +127,7 @@ It guessed that the PDF has 3 columns, because there wasn't any data in the last

    >>> extractor = Stream(Pdf('missing_values.pdf'), ncolumns=5)
    >>> tables = extractor.get_tables()
-   >>> print tables['pg-1']
+   >>> print tables['page-1'][0]

 .. csv-table::

@@ -200,7 +200,7 @@ After getting the x-coordinates, we just need to pass them to Stream, like this.

    >>> extractor = Stream(Pdf('mexican_towns.pdf'), columns='28,67,180,230,425,475,700')
    >>> tables = extractor.get_tables()
-   >>> print tables['pg-1']
+   >>> print tables['page-1'][0]

 .. csv-table::


@@ -26,7 +26,7 @@ def test_lattice_basic():
     extractor = Lattice(Pdf(pdfname,
                         pagenos=[{'start': 2, 'end': 2}], clean=True))
     tables = extractor.get_tables()
-    assert_equal(tables['pg-2'][0], data)
+    assert_equal(tables['page-2'][0], data)


 def test_lattice_fill():

@@ -76,7 +76,7 @@ def test_lattice_fill():
     pdfname = os.path.join(testdir, 'row_span_1.pdf')
     extractor = Lattice(Pdf(pdfname, clean=True), fill='v', scale=40)
     tables = extractor.get_tables()
-    assert_equal(tables['pg-1'][0], data)
+    assert_equal(tables['page-1'][0], data)


 def test_lattice_invert():

@@ -94,4 +94,4 @@ def test_lattice_invert():
     pdfname = os.path.join(testdir, 'lines_in_background_1.pdf')
     extractor = Lattice(Pdf(pdfname, clean=True), invert=True)
     tables = extractor.get_tables()
-    assert_equal(tables['pg-1'][1], data)
+    assert_equal(tables['page-1'][1], data)


@@ -13,57 +13,62 @@ testdir = os.path.dirname(os.path.abspath(__file__))

 def test_stream_basic():
     data = [
-        ["","","","",""],
-        ["C Appendix C: Summary Statistics","","","",""],
-        ["","Table C1: Summary Statistics","","",""],
-        ["","This table contains summary statistics for 2,012 respondents in SAVE 2009.","","",""],
-        ["Variable","Mean","Std. Dev. Min","","Max"],
-        ["Age","50.8","15.9","21","90"],
-        ["Men","0.47","0.50","0","1"],
-        ["East","0.28","0.45","0","1"],
-        ["Rural","0.15","0.36","0","1"],
-        ["Married","0.57","0.50","0","1"],
-        ["Single","0.21","0.40","0","1"],
-        ["Divorced","0.13","0.33","0","1"],
-        ["Widowed","0.08","0.26","0","1"],
-        ["Separated","0.03","0.16","0","1"],
-        ["Partner","0.65","0.48","0","1"],
-        ["Employed","0.55","0.50","0","1"],
-        ["Fulltime","0.34","0.47","0","1"],
-        ["Parttime","0.20","0.40","0","1"],
-        ["Unemployed","0.08","0.28","0","1"],
-        ["Homemaker","0.19","0.40","0","1"],
-        ["Retired","0.28","0.45","0","1"],
-        ["Household size","2.43","1.22","1","9"],
-        ["Households with children","0.37","0.48","0","1"],
-        ["Number of children","1.67","1.38","0","8"],
-        ["Lower secondary education","0.08","0.27","0","1"],
-        ["Upper secondary education","0.60","0.49","0","1"],
-        ["Post secondary, non tert. education","0.12","0.33","0","1"],
-        ["First stage tertiary education","0.17","0.38","0","1"],
-        ["Other education","0.03","0.17","0","1"],
-        ["Household income (Euro/month)","2,127","1,389","22","22,500"],
-        ["Gross wealth - end of 2007 (Euro)","187,281","384,198","0","7,720,000"],
-        ["Gross financial wealth - end of 2007 (Euro)","38,855","114,128","0","2,870,000"],
-        ["","Source: SAVE 2008 and 2009, data is weighted and imputed.","","",""],
-        ["","","","","ECB"],
-        ["","","","","Working Paper Series No 1299"],
-        ["","","","","Febuary 2011"]
+        ["Clave","Nombre Entidad","Clave","Nombre Municipio","Clave","Nombre Localidad"],
+        ["Entidad","","Municipio","","Localidad",""],
+        ["01","Aguascalientes","001","Aguascalientes","0094","Granja Adelita"],
+        ["01","Aguascalientes","001","Aguascalientes","0096","Agua Azul"],
+        ["01","Aguascalientes","001","Aguascalientes","0100","Rancho Alegre"],
+        ["01","Aguascalientes","001","Aguascalientes","0102","Los Arbolitos [Rancho]"],
+        ["01","Aguascalientes","001","Aguascalientes","0104","Ardillas de Abajo (Las Ardillas)"],
+        ["01","Aguascalientes","001","Aguascalientes","0106","Arellano"],
+        ["01","Aguascalientes","001","Aguascalientes","0112","Bajío los Vázquez"],
+        ["01","Aguascalientes","001","Aguascalientes","0113","Bajío de Montoro"],
+        ["01","Aguascalientes","001","Aguascalientes","0114","Residencial San Nicolás [Baños la Cantera]"],
+        ["01","Aguascalientes","001","Aguascalientes","0120","Buenavista de Peñuelas"],
+        ["01","Aguascalientes","001","Aguascalientes","0121","Cabecita 3 Marías (Rancho Nuevo)"],
+        ["01","Aguascalientes","001","Aguascalientes","0125","Cañada Grande de Cotorina"],
+        ["01","Aguascalientes","001","Aguascalientes","0126","Cañada Honda [Estación]"],
+        ["01","Aguascalientes","001","Aguascalientes","0127","Los Caños"],
+        ["01","Aguascalientes","001","Aguascalientes","0128","El Cariñán"],
+        ["01","Aguascalientes","001","Aguascalientes","0129","El Carmen [Granja]"],
+        ["01","Aguascalientes","001","Aguascalientes","0135","El Cedazo (Cedazo de San Antonio)"],
+        ["01","Aguascalientes","001","Aguascalientes","0138","Centro de Arriba (El Taray)"],
+        ["01","Aguascalientes","001","Aguascalientes","0139","Cieneguilla (La Lumbrera)"],
+        ["01","Aguascalientes","001","Aguascalientes","0141","Cobos"],
+        ["01","Aguascalientes","001","Aguascalientes","0144","El Colorado (El Soyatal)"],
+        ["01","Aguascalientes","001","Aguascalientes","0146","El Conejal"],
+        ["01","Aguascalientes","001","Aguascalientes","0157","Cotorina de Abajo"],
+        ["01","Aguascalientes","001","Aguascalientes","0162","Coyotes"],
+        ["01","Aguascalientes","001","Aguascalientes","0166","La Huerta (La Cruz)"],
+        ["01","Aguascalientes","001","Aguascalientes","0170","Cuauhtémoc (Las Palomas)"],
+        ["01","Aguascalientes","001","Aguascalientes","0171","Los Cuervos (Los Ojos de Agua)"],
+        ["01","Aguascalientes","001","Aguascalientes","0172","San José [Granja]"],
+        ["01","Aguascalientes","001","Aguascalientes","0176","La Chiripa"],
+        ["01","Aguascalientes","001","Aguascalientes","0182","Dolores"],
+        ["01","Aguascalientes","001","Aguascalientes","0183","Los Dolores"],
+        ["01","Aguascalientes","001","Aguascalientes","0190","El Duraznillo"],
+        ["01","Aguascalientes","001","Aguascalientes","0191","Los Durón"],
+        ["01","Aguascalientes","001","Aguascalientes","0197","La Escondida"],
+        ["01","Aguascalientes","001","Aguascalientes","0201","Brande Vin [Bodegas]"],
+        ["01","Aguascalientes","001","Aguascalientes","0207","Valle Redondo"],
+        ["01","Aguascalientes","001","Aguascalientes","0209","La Fortuna"],
+        ["01","Aguascalientes","001","Aguascalientes","0212","Lomas del Gachupín"],
+        ["01","Aguascalientes","001","Aguascalientes","0213","El Carmen (Gallinas Güeras) [Rancho]"],
+        ["01","Aguascalientes","001","Aguascalientes","0216","La Gloria"],
+        ["01","Aguascalientes","001","Aguascalientes","0226","Hacienda Nueva"],
     ]
-    pdfname = os.path.join(testdir,
-        "tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-027.pdf")
-    extractor = Stream(Pdf(pdfname, pagenos=[{'start': 3, 'end': 3}],
+    pdfname = os.path.join(testdir, 'mexican_towns.pdf')
+    extractor = Stream(Pdf(pdfname, pagenos=[{'start': 1, 'end': 1}],
                        clean=True))
     tables = extractor.get_tables()
-    assert_equal(tables['pg-3'][0], data)
+    assert_equal(tables['page-1'][0], data)


 def test_stream_ncolumns():
     data = [
-        ["","","","",""],
-        ["","Bhandara - Key Indicators","","",""],
+        ["Bhandara - Key Indicators","","","",""],
         ["","DLHS-4 (2012-13)","","DLHS-3 (2007-08)",""],
         ["Indicators","TOTAL","RURAL","TOTAL","RURAL"],
         ["Reported Prevalence of Morbidity","","","",""],
@@ -105,21 +110,20 @@ def test_stream_ncolumns():
         ["Moderately High (Systolic >160 mm of Hg & Diastolic >100 mm of Hg ) .....................................","8.2","7.1","",""],
         ["Very High (Systolic >180 mm of Hg & Diastolic >110 mm of Hg ) ...............................................","3.7","3.1","",""],
         ["14 Any anaemia below 11g/dl, severe anaemia below 7g/dl. 15 Excluding age group 19 years","","","",""],
-        ["","Chronic Illness :Any person with symptoms persisting for longer than one month is defined as suffering from chronic illness","","",""],
-        ["","4","","",""]
+        ["Chronic Illness :Any person with symptoms persisting for longer than one month is defined as suffering from chronic illness","","","",""],
     ]
     pdfname = os.path.join(testdir, 'missing_values.pdf')
     extractor = Stream(Pdf(pdfname, char_margin=1.0, clean=True),
                        ncolumns=5)
     tables = extractor.get_tables()
-    assert_equal(tables['pg-1'][0], data)
+    assert_equal(tables['page-1'][0], data)


 def test_stream_columns():
     data = [
-        ["","","","","",""],
-        ["Clave","","Clave","","Clave",""],
-        ["","Nombre Entidad","","Nombre Municipio","","Nombre Localidad"],
+        ["Clave","Nombre Entidad","Clave","Nombre Municipio","Clave","Nombre Localidad"],
         ["Entidad","","Municipio","","Localidad",""],
         ["01","Aguascalientes","001","Aguascalientes","0094","Granja Adelita"],
         ["01","Aguascalientes","001","Aguascalientes","0096","Agua Azul"],

@@ -160,10 +164,11 @@ def test_stream_columns():
         ["01","Aguascalientes","001","Aguascalientes","0209","La Fortuna"],
         ["01","Aguascalientes","001","Aguascalientes","0212","Lomas del Gachupín"],
         ["01","Aguascalientes","001","Aguascalientes","0213","El Carmen (Gallinas Güeras) [Rancho]"],
-        ["01","Aguascalientes","001","Aguascalientes","0216","La Gloria"]
+        ["01","Aguascalientes","001","Aguascalientes","0216","La Gloria"],
+        ["01","Aguascalientes","001","Aguascalientes","0226","Hacienda Nueva"],
     ]
     pdfname = os.path.join(testdir, 'mexican_towns.pdf')
     extractor = Stream(Pdf(pdfname, clean=True),
                        columns='28,67,180,230,425,475,700')
     tables = extractor.get_tables()
-    assert_equal(tables['pg-1'][0], data)
+    assert_equal(tables['page-1'][0], data)


@@ -4,8 +4,12 @@ import os
 import sys
 import time
 import logging
+import warnings
+import numpy as np
 from docopt import docopt
+from collections import Counter
+import matplotlib.pyplot as plt
 from PyPDF2 import PdfFileReader

 from camelot.pdf import Pdf
@@ -22,12 +26,23 @@ usage:
 options:
     -h, --help                 Show this screen.
     -v, --version              Show version.
-    -V, --verbose              Verbose.
     -p, --pages <pageno>       Comma-separated list of page numbers.
                                Example: -p 1,3-6,10 [default: 1]
+    -P, --parallel             Parallelize the parsing process.
     -f, --format <format>      Output format. (csv,tsv,html,json,xlsx) [default: csv]
-    -l, --log                  Print log to file.
+    -l, --log                  Log to file.
+    -V, --verbose              Verbose.
     -o, --output <directory>   Output directory.
+    -M, --cmargin <cmargin>    Char margin. Chars closer than cmargin are
+                               grouped together to form a word. [default: 2.0]
+    -L, --lmargin <lmargin>    Line margin. Lines closer than lmargin are
+                               grouped together to form a textbox. [default: 0.5]
+    -W, --wmargin <wmargin>    Word margin. Insert blank spaces between chars
+                               if distance between words is greater than word
+                               margin. [default: 0.1]
+    -S, --save-info            Save parsing info for each page to a file.
+    -X, --plot <dist>          Plot distributions. (page,all,rc)
+    -Z, --summary              Summarize metrics.

 camelot methods:
     lattice  Looks for lines between data.
@@ -47,12 +62,12 @@ options:
                                cells. Example: -F h, -F v, -F hv
     -s, --scale <scale>        Scaling factor. Large scaling factor leads to
                                smaller lines being detected. [default: 15]
-    -i, --invert               Invert pdf image to make sure that lines are
-                               in foreground.
     -j, --jtol <jtol>          Tolerance to account for when comparing joint
                                and line coordinates. [default: 2]
     -m, --mtol <mtol>          Tolerance to account for when merging lines
                                which are very close. [default: 2]
+    -i, --invert               Invert pdf image to make sure that lines are
+                               in foreground.
     -d, --debug <debug>        Debug by visualizing pdf geometry.
                                (contour,line,joint,table) Example: -d table
 """
@ -69,17 +84,159 @@ options:
Example: -c 10.1,20.2,30.3 Example: -c 10.1,20.2,30.3
-y, --ytol <ytol> Tolerance to account for when grouping rows -y, --ytol <ytol> Tolerance to account for when grouping rows
together. [default: 2] together. [default: 2]
-M, --cmargin <cmargin> Char margin. Chars closer than cmargin are -m, --mtol <mtol> Tolerance to account for when merging columns
grouped together to form a word. [default: 2.0] together. [default: 2]
-L, --lmargin <lmargin> Line margin. Lines closer than lmargin are
grouped together to form a textbox. [default: 0.5]
-W, --wmargin <wmargin> Word margin. Insert blank spaces between chars
if distance between words is greater than word
margin. [default: 0.1]
-d, --debug Debug by visualizing textboxes. -d, --debug Debug by visualizing textboxes.
""" """
def plot_table_barchart(r, c, p, pno, tno):
row_idx = [i + 1 for i, row in enumerate(r)]
col_idx = [i + 1 for i, col in enumerate(c)]
r_index = np.arange(len(r))
c_index = np.arange(len(c))
width = 0.7
plt.figure(figsize=(8, 6))
plt.subplot(2, 1, 1)
plt.title('Percentage of empty cells in table: {0:.2f}'.format(p))
plt.xlabel('row index')
plt.ylabel('number of non-empty cells in row')
plt.bar(r_index, r)
plt.xticks(r_index + width * 0.5, row_idx)
plt.ylim(0, len(c))
plt.subplot(2, 1, 2)
plt.xlabel('column index')
plt.ylabel('number of non-empty cells in column')
plt.bar(c_index, c)
plt.xticks(c_index + width * 0.5, col_idx)
plt.ylim(0, len(r))
plt.savefig(''.join([pno, '_', tno, '.png']), dpi=300)
def plot_all_barchart(data, output):
    # Histogram of row 'fullness' (fraction of non-empty cells per row)
    # across all parsed tables, saved as <output>_all.png. Counter is
    # collections.Counter, presumably imported at the top of this script.
    r_empty_cells = []
    for page_number in data.keys():
        page = data[page_number]
        for table_number in page.keys():
            table = page[table_number]
            r_empty_cells.extend(
                [r / float(table['ncols']) for r in table['r_nempty_cells']])
    c = Counter(r_empty_cells)
    if 0.0 not in c:
        c.update({0.0: 0})
    if 1.0 not in c:
        c.update({1.0: 0})
    plt.figure(figsize=(8, 6))
    plt.xlabel('percentage of non-empty cells in a row')
    plt.ylabel('percentage of rows processed')
    row_p = [count / float(sum(c.values())) for count in c.values()]
    plt.bar(c.keys(), row_p, align='center', width=0.05)
    plt.ylim(0, 1.0)
    plt.savefig(''.join([output, '_all.png']), dpi=300)
def plot_rc_piechart(data, output):
    # Pie charts of how row and column counts are distributed across all
    # parsed tables, saved as <output>_rc.png.
    from matplotlib import cm
    tables = 0
    rows, cols = [], []
    for page_number in data.keys():
        page = data[page_number]
        for table_number in page.keys():
            table = page[table_number]
            tables += 1
            rows.append(table['nrows'])
            cols.append(table['ncols'])
    r = Counter(rows)
    c = Counter(cols)
    plt.figure(figsize=(8, 6))
    cs1 = cm.Set1(np.arange(len(r)) / float(len(r)))
    ax1 = plt.subplot(211, aspect='equal')
    ax1.pie(r.values(), colors=cs1, labels=r.keys(), startangle=90)
    ax1.set_title('row distribution across tables')
    cs2 = cm.Set1(np.arange(len(c)) / float(len(c)))
    ax2 = plt.subplot(212, aspect='equal')
    ax2.pie(c.values(), colors=cs2, labels=c.keys(), startangle=90)
    ax2.set_title('column distribution across tables')
    plt.savefig(''.join([output, '_rc.png']), dpi=300)
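All three plot helpers, and summary() below, consume the same per-table metrics dict built by the extractors. Its shape, inferred from the keys read in this file (field values are illustrative, not real output):

    data = {
        'page-1': {
            'table-1': {
                'data': [['h1', 'h2'], ['a', '']],  # parsed cell text
                'nrows': 2, 'ncols': 2,
                'r_nempty_cells': [2, 1],   # non-empty cells per row
                'c_nempty_cells': [2, 1],   # non-empty cells per column
                'empty_p': 25.0,            # percentage of empty cells
                'line_p': 100.0, 'text_p': 75.0,  # lattice-only metrics
                'score': 90.0,
            },
        },
    }
    plot_rc_piechart(data, 'out')  # -> out_rc.png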
def summary(data, p_time):
    # Print overall stats: pages and tables processed, time taken, average
    # parse score, plus a guess at multi-page tables (runs of consecutive
    # pages whose tables share a column count) and their probable headers.
    from operator import itemgetter
    from itertools import groupby
    scores = []
    continuous_tables = []
    total_tables = 0
    for page_number in data.keys():
        page = data[page_number]
        total_tables += len(page.keys())
        for table_number in page.keys():
            table = page[table_number]
            continuous_tables.append((page_number, table_number, table['ncols']))
            scores.append(table['score'])
    avg_score = np.mean(scores)
    ct_pages = []
    header_string = ""
    if len(continuous_tables) > 1:
        # sort by (page, table) number, then group tables by column count
        tables = sorted(continuous_tables,
                        key=lambda x: (int(x[0][5:]), int(x[1][6:])))
        for k, g in groupby(tables, key=itemgetter(2)):
            g = list(g)
            tables_same_ncols = set([int(t[0][5:]) for t in g])
            tables_same_ncols = sorted(list(tables_same_ncols))
            # (index - page number) is constant within a consecutive run,
            # so grouping on it splits the pages into runs
            for K, G in groupby(enumerate(tables_same_ncols),
                                key=lambda (i, x): i - x):
                G = list(G)
                ct_pages.append((str(G[0][1]), str(G[-1][1])))
        result_headers = []
        for ct in ct_pages:
            header_idx = {}
            possible_headers = []
            ncols = 0
            for page_number in range(int(ct[0]), int(ct[1]) + 1):
                page = data['page-{0}'.format(page_number)]
                for table_number in page.keys():
                    table = page[table_number]
                    ncols = table['ncols']
                    for i, row in enumerate(table['data']):
                        try:
                            header_idx[tuple(row)].append(i)
                        except KeyError:
                            header_idx[tuple(row)] = [i]
            # the ten most repeated rows, keeping only full-width ones
            possible_headers = sorted(header_idx, key=lambda k: len(header_idx[k]),
                                      reverse=True)[:10]
            possible_headers = filter(lambda z: len(z) == ncols,
                [filter(lambda x: x != '', p_h) for p_h in possible_headers])
            modes = []
            for p_h in possible_headers:
                try:
                    modes.append((p_h, max(set(header_idx[p_h]),
                                           key=header_idx[p_h].count)))
                except KeyError:
                    pass
            # the candidate that most often sits highest in its table
            header = min(modes, key=lambda x: x[1])[0]
            result_headers.append(header)
        header_string = "Multi-page table headers*:\n"
        header_string = ''.join([header_string, '\n'.join(
            ['pages {0} -> {1}{2}{3}'.format('-'.join([cr[0][0], cr[0][1]]),
             '"', '","'.join(cr[1]), '"') for cr in zip(ct_pages, result_headers)])])
    avg_time = "Time taken per page: {0:.2f} seconds\n".format(
        p_time / float(len(data))) if len(data) != 1 else ""
    equal_ncols = "\nMulti-page tables on*: {0}\n".format(
        ', '.join(['-'.join(ct) for ct in ct_pages])) if len(data) != 1 else ""
    stats = [len(data), p_time, avg_time, total_tables, avg_score, equal_ncols]
    stat_string = ("Pages processed: {0}\nTime taken: {1:.2f} seconds\n"
                   "{2}Tables found: {3}\nAverage score: {4:.2f}{5}".format(*stats))
    print(''.join([stat_string, header_string]))
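The run-detection trick above deserves a note: under enumerate(), (index - value) stays constant within a run of consecutive page numbers, so grouping on that difference splits the pages into runs. A small example in the script's Python 2 idiom:

    from itertools import groupby

    pages = [1, 2, 3, 7, 8]  # pages whose tables share a column count
    runs = [[x for __, x in G]
            for __, G in groupby(enumerate(pages), key=lambda (i, x): i - x)]
    # runs == [[1, 2, 3], [7, 8]], so ct_pages records ('1', '3') and ('7', '8')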
def convert_to_html(table):
    html = ''
    html = ''.join([html, '<table border="1">\n'])
@@ -99,23 +256,23 @@ def write_to_disk(data, f='csv', output=None, filename=None):
    if f in ['csv', 'tsv']:
        import csv
        delimiter = ',' if f == 'csv' else '\t'
        for page_number in sorted(data.keys()):
            for table_number in sorted(data[page_number].keys()):
                dsvname = '{0}.{1}'.format(
                    ''.join([page_number, '_', table_number]), f)
                with open(os.path.join(output, dsvname), 'w') as outfile:
                    writer = csv.writer(
                        outfile, delimiter=delimiter, quoting=csv.QUOTE_ALL)
                    for row in data[page_number][table_number]['data']:
                        writer.writerow(row)
    elif f == 'html':
        htmlname = '{0}.html'.format(froot)
        for page_number in sorted(data.keys()):
            for table_number in sorted(data[page_number].keys()):
                with open(os.path.join(output, htmlname), 'a') as htmlfile:
                    htmlfile.write(
                        convert_to_html(data[page_number][table_number]['data']))
    elif f == 'json':
        import json
        with open(os.path.join(output, '{0}.json'.format(froot)), 'w') \
                as jsonfile:
            json.dump(data, jsonfile)
    elif f == 'xlsx':
@@ -123,12 +280,12 @@ def write_to_disk(data, f='csv', output=None, filename=None):
            from pyexcel_xlsx import save_data
            from collections import OrderedDict
            xlsx_data = OrderedDict()
            for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
                for table_number in sorted(data[page_number].keys(),
                                           key=lambda x: int(x[6:])):
                    sheet_name = ''.join([page_number, '_', table_number])
                    xlsx_data.update({sheet_name:
                        [row for row in data[page_number][table_number]['data']]})
            save_data(os.path.join(output, '{0}.xlsx'.format(froot)), xlsx_data)
        except ImportError:
            print("link to install docs")
@@ -147,16 +304,17 @@ if __name__ == '__main__':
    filename = args['<file>']
    filedir = os.path.dirname(args['<file>'])
    logname, __ = os.path.splitext(filename)
    logname = ''.join([logname, '.log'])
    scorename, __ = os.path.splitext(filename)
    scorename = ''.join([scorename, '_info.csv'])
    pngname, __ = os.path.splitext(filename)
    if args['--log']:
        FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
        if args['--output']:
            logname = os.path.join(args['--output'], os.path.basename(logname))
        logging.basicConfig(
            filename=logname, filemode='w', format=FORMAT, level=logging.DEBUG)
    p = []
    if args['--pages'] == '1':
@@ -173,47 +331,142 @@ if __name__ == '__main__':
            else:
                p.append({'start': int(r), 'end': int(r)})
    margin_tuple = (float(args['--cmargin']), float(args['--lmargin']),
                    float(args['--wmargin']))
    if args['<method>'] == 'lattice':
        try:
            manager = Pdf(Lattice(fill=args['--fill'],
                                  scale=int(args['--scale']),
                                  invert=args['--invert'],
                                  jtol=int(args['--jtol']),
                                  mtol=int(args['--mtol']),
                                  pdf_margin=margin_tuple,
                                  debug=args['--debug']),
                          filename,
                          pagenos=p,
                          parallel=args['--parallel'],
                          clean=True)
            data = manager.extract()
            processing_time = time.time() - start_time
            vprint("Finished processing in", processing_time, "seconds")
            logging.info("Finished processing in " + str(processing_time) + " seconds")
            if args['--plot']:
                if args['--output']:
                    pngname = os.path.join(args['--output'], os.path.basename(pngname))
                plot_type = args['--plot'].split(',')
                if 'page' in plot_type:
                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
                        page = data[page_number]
                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
                            table = page[table_number]
                            plot_table_barchart(table['r_nempty_cells'],
                                                table['c_nempty_cells'],
                                                table['empty_p'],
                                                page_number,
                                                table_number)
                if 'all' in plot_type:
                    plot_all_barchart(data, pngname)
                if 'rc' in plot_type:
                    plot_rc_piechart(data, pngname)
            if args['--summary']:
                summary(data, processing_time)
            if args['--save-info']:
                if args['--output']:
                    scorename = os.path.join(args['--output'], os.path.basename(scorename))
                with open(scorename, 'w') as score_file:
                    score_file.write('table,nrows,ncols,empty_p,line_p,text_p,score\n')
                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
                        page = data[page_number]
                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
                            table = page[table_number]
                            score_file.write('{0},{1},{2},{3},{4},{5},{6}\n'.format(
                                ''.join([page_number, '_', table_number]),
                                table['nrows'],
                                table['ncols'],
                                table['empty_p'],
                                table['line_p'],
                                table['text_p'],
                                table['score']))
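                    # For reference, the resulting _info.csv would read like
                    # (hypothetical values, echoing the sketch above):
                    #   table,nrows,ncols,empty_p,line_p,text_p,score
                    #   page-1_table-1,2,2,25.0,100.0,75.0,90.0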
            if args['--debug']:
                manager.debug_plot()
        except Exception as e:
            logging.exception(e.message, exc_info=True)
            sys.exit()
    elif args['<method>'] == 'stream':
        try:
            manager = Pdf(Stream(ncolumns=int(args['--ncols']),
                                 columns=args['--columns'],
                                 ytol=int(args['--ytol']),
                                 mtol=int(args['--mtol']),
                                 pdf_margin=margin_tuple,
                                 debug=args['--debug']),
                          filename,
                          pagenos=p,
                          parallel=args['--parallel'],
                          clean=True)
            data = manager.extract()
            processing_time = time.time() - start_time
            vprint("Finished processing in", processing_time, "seconds")
            logging.info("Finished processing in " + str(processing_time) + " seconds")
            if args['--plot']:
                if args['--output']:
                    pngname = os.path.join(args['--output'], os.path.basename(pngname))
                plot_type = args['--plot'].split(',')
                if 'page' in plot_type:
                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
                        page = data[page_number]
                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
                            table = page[table_number]
                            plot_table_barchart(table['r_nempty_cells'],
                                                table['c_nempty_cells'],
                                                table['empty_p'],
                                                page_number,
                                                table_number)
                if 'all' in plot_type:
                    plot_all_barchart(data, pngname)
                if 'rc' in plot_type:
                    plot_rc_piechart(data, pngname)
            if args['--summary']:
                summary(data, processing_time)
            if args['--save-info']:
                if args['--output']:
                    scorename = os.path.join(args['--output'], os.path.basename(scorename))
                with open(scorename, 'w') as score_file:
                    # stream has no line/text metrics, so fewer columns here
                    score_file.write('table,nrows,ncols,empty_p,score\n')
                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
                        page = data[page_number]
                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
                            table = page[table_number]
                            score_file.write('{0},{1},{2},{3},{4}\n'.format(
                                ''.join([page_number, '_', table_number]),
                                table['nrows'],
                                table['ncols'],
                                table['empty_p'],
                                table['score']))
            if args['--debug']:
                manager.debug_plot()
        except Exception as e:
            logging.exception(e.message, exc_info=True)
            sys.exit()
    if args['--debug']:
        print("See 'camelot <method> -h' for various parameters you can tweak.")
    else:
        output = filedir if args['--output'] is None else args['--output']
        write_to_disk(data, f=args['--format'],
                      output=output, filename=filename)
vprint("finished in", time.time() - start_time, "seconds")
logging.info("Time taken: " + str(time.time() - start_time) + " seconds")