Add various metrics to score the quality of a parse

parent 43a009dab4
commit 552f9cf422

camelot/lattice.py
@@ -1,18 +1,31 @@
 from __future__ import print_function
+from __future__ import division
 import os
 import types
 import copy_reg
+import logging

 import cv2
 import numpy as np

+from wand.image import Image
+
 from .table import Table
 from .utils import (transform, elements_bbox, detect_vertical, merge_close_values,
-                    get_row_index, get_column_index, reduce_index, outline,
-                    fill_spanning, remove_empty, encode_list)
+                    get_row_index, get_column_index, get_score, reduce_index,
+                    outline, fill_spanning, count_empty, encode_list, pdf_to_text)


 __all__ = ['Lattice']


 def _reduce_method(m):
     if m.im_self is None:
         return getattr, (m.im_class, m.im_func.func_name)
     else:
         return getattr, (m.im_self, m.im_func.func_name)
 copy_reg.pickle(types.MethodType, _reduce_method)


 def _morph_transform(imagename, scale=15, invert=False):
     """Morphological Transformation
@@ -65,8 +78,8 @@ def _morph_transform(imagename, scale=15, invert=False):
     vertical = threshold
     horizontal = threshold

-    verticalsize = vertical.shape[0] / scale
-    horizontalsize = horizontal.shape[1] / scale
+    verticalsize = vertical.shape[0] // scale
+    horizontalsize = horizontal.shape[1] // scale

     ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize))
     hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1))
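The switch from / to // matters once `from __future__ import division` is in effect: cv2.getStructuringElement expects integer kernel sizes, and true division would pass floats. A minimal sketch of the idea (the 600x800 image is hypothetical):

    import cv2
    import numpy as np

    img = np.zeros((600, 800), dtype=np.uint8)  # stand-in for a page image
    scale = 15
    verticalsize = img.shape[0] // scale        # 40, stays an int under true division
    ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize))
    hor = cv2.getStructuringElement(cv2.MORPH_RECT, (img.shape[1] // scale, 1))
    # a larger scale -> smaller kernels -> shorter line segments survive the morphology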
@@ -79,8 +92,12 @@ def _morph_transform(imagename, scale=15, invert=False):

     mask = vertical + horizontal
     joints = np.bitwise_and(vertical, horizontal)
-    __, contours, __ = cv2.findContours(
-        mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    try:
+        __, contours, __ = cv2.findContours(
+            mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    except ValueError:
+        contours, __ = cv2.findContours(
+            mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
     contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]

     tables = {}

@@ -88,8 +105,12 @@ def _morph_transform(imagename, scale=15, invert=False):
         c_poly = cv2.approxPolyDP(c, 3, True)
         x, y, w, h = cv2.boundingRect(c_poly)
         roi = joints[y : y + h, x : x + w]
-        __, jc, __ = cv2.findContours(
-            roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
+        try:
+            __, jc, __ = cv2.findContours(
+                roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
+        except ValueError:
+            jc, __ = cv2.findContours(
+                roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
         if len(jc) <= 4:  # remove contours with 4 or fewer joints
             continue
         joint_coords = []

@@ -100,16 +121,24 @@ def _morph_transform(imagename, scale=15, invert=False):
         tables[(x, y + h, x + w, y)] = joint_coords

     v_segments, h_segments = [], []
-    _, vcontours, _ = cv2.findContours(
-        vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    try:
+        _, vcontours, _ = cv2.findContours(
+            vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    except ValueError:
+        vcontours, _ = cv2.findContours(
+            vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
     for vc in vcontours:
         x, y, w, h = cv2.boundingRect(vc)
         x1, x2 = x, x + w
         y1, y2 = y, y + h
         v_segments.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1))

-    _, hcontours, _ = cv2.findContours(
-        horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    try:
+        _, hcontours, _ = cv2.findContours(
+            horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    except ValueError:
+        hcontours, _ = cv2.findContours(
+            horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
     for hc in hcontours:
         x, y, w, h = cv2.boundingRect(hc)
         x1, x2 = x, x + w
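The try/except ValueError pattern repeated above is a version shim: cv2.findContours returns (image, contours, hierarchy) on OpenCV 3.x but (contours, hierarchy) on 2.x, so unpacking the wrong arity raises ValueError. A standalone sketch of the same idea:

    import cv2
    import numpy as np

    mask = np.zeros((50, 50), dtype=np.uint8)
    try:
        # OpenCV 3.x: three return values
        __, contours, __ = cv2.findContours(
            mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    except ValueError:
        # OpenCV 2.x: two return values
        contours, __ = cv2.findContours(
            mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)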
@@ -160,24 +189,19 @@ class Lattice:
         page as value.
     """

-    def __init__(self, pdfobject, fill=None, scale=15, jtol=2, mtol=2,
-                 invert=False, debug=None, verbose=False):
+    def __init__(self, fill=None, scale=15, jtol=2, mtol=2,
+                 invert=False, pdf_margin=(2.0, 0.5, 0.1), debug=None):

-        self.pdfobject = pdfobject
+        self.method = 'lattice'
         self.fill = fill
         self.scale = scale
         self.jtol = jtol
         self.mtol = mtol
         self.invert = invert
+        self.char_margin, self.line_margin, self.word_margin = pdf_margin
         self.debug = debug
-        self.verbose = verbose
-        self.tables = {}
-        if self.debug is not None:
-            self.debug_images = {}
-            self.debug_segments = {}
-            self.debug_tables = {}

-    def get_tables(self):
+    def get_tables(self, pdfname):
         """Returns all tables found in given pdf.

         Returns
@@ -186,14 +210,16 @@ class Lattice:
         Dictionary with page number as key and list of tables on that
         page as value.
         """
-        vprint = print if self.verbose else lambda *a, **k: None
-        self.pdfobject.split()
-        self.pdfobject.convert()
-        for page in self.pdfobject.extract():
-            p, text, __, width, height = page
-            pkey = 'pg-{0}'.format(p)
-            imagename = os.path.join(
-                self.pdfobject.temp, '{}.png'.format(pkey))
+        text, __, width, height = pdf_to_text(pdfname, self.char_margin,
+                                              self.line_margin, self.word_margin)
+        bname, __ = os.path.splitext(pdfname)
+        if not text:
+            logging.warning("{0}: PDF has no text. It may be an image.".format(
+                os.path.basename(bname)))
+            return None
+        imagename = ''.join([bname, '.png'])
+        with Image(filename=pdfname, depth=8, resolution=300) as png:
+            png.save(filename=imagename)
         pdf_x = width
         pdf_y = height
         img, table_bbox, v_segments, h_segments = _morph_transform(
@@ -203,24 +229,27 @@ class Lattice:
         scaling_factor_x = pdf_x / float(img_x)
         scaling_factor_y = pdf_y / float(img_y)

-        if self.debug is not None:
-            self.debug_images[pkey] = (img, table_bbox)
+        if self.debug:
+            self.debug_images = (img, table_bbox)

         factors = (scaling_factor_x, scaling_factor_y, img_y)
         table_bbox, v_segments, h_segments = transform(table_bbox, v_segments,
                                                        h_segments, factors)

-        if self.debug is not None:
-            self.debug_segments[pkey] = (v_segments, h_segments)
+        if self.debug:
+            self.debug_segments = (v_segments, h_segments)
+            self.debug_tables = []

-        if self.debug is not None:
-            debug_page_tables = []
-        page_tables = []
+        pdf_page = {}
+        page_tables = {}
+        table_no = 1
         # sort tables based on y-coord
         for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
             # select edges which lie within table_bbox
+            table_info = {}
             text_bbox, v_s, h_s = elements_bbox(k, text, v_segments,
                                                 h_segments)
+            table_info['text_p'] = 100 * (1 - (len(text_bbox) / len(text)))
             rotated = detect_vertical(text_bbox)
             cols, rows = zip(*table_bbox[k])
             cols, rows = list(cols), list(rows)
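text_p records what share of the page's text falls outside the detected table; a quick sanity check of the arithmetic (numbers invented):

    text = ['c'] * 200        # stand-in for 200 LTChar objects on the page
    text_bbox = ['c'] * 150   # 150 of them fall inside this table's bbox
    text_p = 100 * (1 - (len(text_bbox) / float(len(text))))
    print(text_p)  # 25.0 -> a quarter of the page's text lies outside the table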
@@ -238,13 +267,15 @@ class Lattice:
             table = Table(cols, rows)
             # set table edges to True using ver+hor lines
             table = table.set_edges(v_s, h_s, jtol=self.jtol)
+            nouse = table.nocont_ / (len(v_s) + len(h_s))
+            table_info['line_p'] = 100 * (1 - nouse)
             # set spanning cells to True
             table = table.set_spanning()
             # set table border edges to True
             table = outline(table)

-            if self.debug is not None:
-                debug_page_tables.append(table)
+            if self.debug:
+                self.debug_tables.append(table)

             # fill text after sorting it
             if rotated == '':
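line_p penalizes detected line segments that never snapped to a cell edge (Table.set_edges increments nocont_ for those, per the table.py hunk further down). A made-up example:

    nocont_ = 3                  # segments that matched no row/column joint
    n_segments = 10 + 20         # len(v_s) + len(h_s)
    line_p = 100 * (1 - nocont_ / float(n_segments))
    print(line_p)  # 90.0 -> 90% of the segments contributed to the table grid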
@@ -253,17 +284,28 @@ class Lattice:
                 text_bbox.sort(key=lambda x: (x.x0, x.y0))
             elif rotated == 'right':
                 text_bbox.sort(key=lambda x: (-x.x0, -x.y0))

+            rerror = []
+            cerror = []
             for t in text_bbox:
-                r_idx = get_row_index(t, rows)
-                c_idx = get_column_index(t, cols)
-                if None in [r_idx, c_idx]:
-                    # couldn't assign LTChar to any cell
-                    pass
-                else:
+                try:
+                    r_idx, rass_error = get_row_index(t, rows)
+                except TypeError:
+                    # couldn't assign LTChar to any cell
+                    continue
+                try:
+                    c_idx, cass_error = get_column_index(t, cols)
+                except TypeError:
+                    # couldn't assign LTChar to any cell
+                    continue
+                rerror.append(rass_error)
+                cerror.append(cass_error)
                 r_idx, c_idx = reduce_index(
                     table, rotated, r_idx, c_idx)
                 table.cells[r_idx][c_idx].add_text(
                     t.get_text().strip('\n'))
+            score = get_score([[50, rerror], [50, cerror]])
+            table_info['score'] = score

             if self.fill is not None:
                 table = fill_spanning(table, fill=self.fill)
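Why TypeError: when get_row_index or get_column_index finds no enclosing band it falls off the end of its loop and implicitly returns None, and unpacking None into two names raises TypeError. In miniature:

    def find(x):
        if x > 0:
            return x, 0.0  # (index, assignment error)
        # falls through -> implicitly returns None

    try:
        idx, err = find(-1)   # unpacking None
    except TypeError:
        print('could not assign')  # this branch runs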
@@ -273,82 +315,19 @@ class Lattice:
             elif rotated == 'right':
                 ar = zip(*ar[::1])
                 ar.reverse()
-            ar = remove_empty(ar)
-            ar = [list(o) for o in ar]
-            page_tables.append(encode_list(ar))
-            vprint(pkey)
-        self.tables[pkey] = page_tables
+            ar = encode_list(ar)
+            table_info['data'] = ar
+            empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
+            table_info['empty_p'] = empty_p
+            table_info['r_nempty_cells'] = r_nempty_cells
+            table_info['c_nempty_cells'] = c_nempty_cells
+            table_info['nrows'] = len(ar)
+            table_info['ncols'] = len(ar[0])
+            page_tables['table_{0}'.format(table_no)] = table_info
+            table_no += 1
+        pdf_page[os.path.basename(bname)] = page_tables

-        if self.debug is not None:
-            self.debug_tables[pkey] = debug_page_tables
-
-        if self.pdfobject.clean:
-            self.pdfobject.remove_tempdir()
-
-        if self.debug is not None:
+        if self.debug:
             return None

-        return self.tables
-
-    def plot_geometry(self, geometry):
-        """Plots various pdf geometries that are detected so user can
-        tweak scale, jtol, mtol parameters.
-        """
-        import matplotlib.pyplot as plt
-
-        if geometry == 'contour':
-            for pkey in self.debug_images.keys():
-                img, table_bbox = self.debug_images[pkey]
-                for t in table_bbox.keys():
-                    cv2.rectangle(img, (t[0], t[1]),
-                                  (t[2], t[3]), (255, 0, 0), 3)
-                plt.imshow(img)
-                plt.show()
-        elif geometry == 'joint':
-            x_coord = []
-            y_coord = []
-            for pkey in self.debug_images.keys():
-                img, table_bbox = self.debug_images[pkey]
-                for k in table_bbox.keys():
-                    for coord in table_bbox[k]:
-                        x_coord.append(coord[0])
-                        y_coord.append(coord[1])
-                max_x, max_y = max(x_coord), max(y_coord)
-                plt.plot(x_coord, y_coord, 'ro')
-                plt.axis([0, max_x + 100, max_y + 100, 0])
-                plt.imshow(img)
-                plt.show()
-        elif geometry == 'line':
-            for pkey in self.debug_segments.keys():
-                v_s, h_s = self.debug_segments[pkey]
-                for v in v_s:
-                    plt.plot([v[0], v[2]], [v[1], v[3]])
-                for h in h_s:
-                    plt.plot([h[0], h[2]], [h[1], h[3]])
-                plt.show()
-        elif geometry == 'table':
-            for pkey in self.debug_tables.keys():
-                for table in self.debug_tables[pkey]:
-                    for i in range(len(table.cells)):
-                        for j in range(len(table.cells[i])):
-                            if table.cells[i][j].left:
-                                plt.plot([table.cells[i][j].lb[0],
-                                          table.cells[i][j].lt[0]],
-                                         [table.cells[i][j].lb[1],
-                                          table.cells[i][j].lt[1]])
-                            if table.cells[i][j].right:
-                                plt.plot([table.cells[i][j].rb[0],
-                                          table.cells[i][j].rt[0]],
-                                         [table.cells[i][j].rb[1],
-                                          table.cells[i][j].rt[1]])
-                            if table.cells[i][j].top:
-                                plt.plot([table.cells[i][j].lt[0],
-                                          table.cells[i][j].rt[0]],
-                                         [table.cells[i][j].lt[1],
-                                          table.cells[i][j].rt[1]])
-                            if table.cells[i][j].bottom:
-                                plt.plot([table.cells[i][j].lb[0],
-                                          table.cells[i][j].rb[0]],
-                                         [table.cells[i][j].lb[1],
-                                          table.cells[i][j].rb[1]])
-                    plt.show()
+        return pdf_page
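With these changes get_tables no longer returns a page-to-list-of-tables mapping but a nested dict of per-table metrics; roughly (shape inferred from the assignments above, values illustrative):

    pdf_page = {
        'page-1': {                      # os.path.basename(bname)
            'table_1': {
                'data': [['a', 'b'], ['1', '2']],
                'score': 92.5,           # weighted assignment score
                'text_p': 25.0,          # % of page text outside the table
                'line_p': 90.0,          # % of segments used by the grid
                'empty_p': 0.0,          # % of empty cells
                'r_nempty_cells': [2, 2],
                'c_nempty_cells': [2, 2],
                'nrows': 2,
                'ncols': 2,
            }
        }
    }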
camelot/pdf.py
@@ -1,18 +1,11 @@
 import os
 import shutil
 import tempfile
-import itertools
+import multiprocessing as mp

+import cv2
 from PyPDF2 import PdfFileReader, PdfFileWriter
-from pdfminer.pdfparser import PDFParser
-from pdfminer.pdfdocument import PDFDocument
-from pdfminer.pdfpage import PDFPage
-from pdfminer.pdfpage import PDFTextExtractionNotAllowed
-from pdfminer.pdfinterp import PDFResourceManager
-from pdfminer.pdfinterp import PDFPageInterpreter
-from pdfminer.pdfdevice import PDFDevice
-from pdfminer.converter import PDFPageAggregator
-from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal
-from wand.image import Image


 __all__ = ['Pdf']
@@ -38,38 +31,6 @@ def _parse_page_numbers(pagenos):
     return page_numbers


-def _extract_text_objects(layout, LTObject, t=None):
-    """Recursively parses pdf layout to get a list of
-    text objects.
-
-    Parameters
-    ----------
-    layout : object
-        Layout object.
-
-    LTObject : object
-        Text object, either LTChar or LTTextLineHorizontal.
-
-    t : list (optional, default: None)
-
-    Returns
-    -------
-    t : list
-        List of text objects.
-    """
-    if t is None:
-        t = []
-    try:
-        for obj in layout._objs:
-            if isinstance(obj, LTObject):
-                t.append(obj)
-            else:
-                t += _extract_text_objects(obj, LTObject)
-    except AttributeError:
-        pass
-    return t
-
-
 class Pdf:
     """Handles all pdf operations which include:
@@ -99,66 +60,163 @@ class Pdf:
         is greater than word_margin. (optional, default: 0.1)
     """

-    def __init__(self, pdfname, pagenos=[{'start': 1, 'end': 1}],
-                 char_margin=2.0, line_margin=0.5, word_margin=0.1,
-                 clean=False):
+    def __init__(self, extractor, pdfname, pagenos=[{'start': 1, 'end': 1}],
+                 parallel=False, clean=False):

+        self.extractor = extractor
         self.pdfname = pdfname
-        if not self.pdfname.endswith('.pdf'):
-            raise TypeError("Only PDF format is supported right now.")
         self.pagenos = _parse_page_numbers(pagenos)
-        self.char_margin = char_margin
-        self.line_margin = line_margin
-        self.word_margin = word_margin
+        self.parallel = parallel
+        self.cpu_count = mp.cpu_count()
+        self.pool = mp.Pool(processes=self.cpu_count)
         self.clean = clean
         self.temp = tempfile.mkdtemp()

     def split(self):
         """Splits pdf into single page pdfs.
         """
+        if not self.pdfname.endswith('.pdf'):
+            raise TypeError("Only PDF format is supported.")
         infile = PdfFileReader(open(self.pdfname, 'rb'), strict=False)
         for p in self.pagenos:
             page = infile.getPage(p - 1)
             outfile = PdfFileWriter()
             outfile.addPage(page)
-            with open(os.path.join(self.temp, 'pg-{0}.pdf'.format(p)), 'wb') as f:
+            with open(os.path.join(self.temp, 'page-{0}.pdf'.format(p)), 'wb') as f:
                 outfile.write(f)

+    def remove_tempdir(self):
+        shutil.rmtree(self.temp)
+
     def extract(self):
         """Extracts text objects, width, height from a pdf.
         """
-        for p in self.pagenos:
-            pkey = 'pg-{0}'.format(p)
-            pname = os.path.join(self.temp, '{}.pdf'.format(pkey))
-            with open(pname, 'r') as f:
-                parser = PDFParser(f)
-                document = PDFDocument(parser)
-                if not document.is_extractable:
-                    raise PDFTextExtractionNotAllowed
-                laparams = LAParams(char_margin=self.char_margin,
-                                    line_margin=self.line_margin,
-                                    word_margin=self.word_margin)
-                rsrcmgr = PDFResourceManager()
-                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
-                interpreter = PDFPageInterpreter(rsrcmgr, device)
-                for page in PDFPage.create_pages(document):
-                    interpreter.process_page(page)
-                    layout = device.get_result()
-                    lattice_objects = _extract_text_objects(layout, LTChar)
-                    stream_objects = _extract_text_objects(
-                        layout, LTTextLineHorizontal)
-                    width = layout.bbox[2]
-                    height = layout.bbox[3]
-                    yield p, lattice_objects, stream_objects, width, height
+        self.split()
+        pages = [os.path.join(self.temp, 'page-{0}.pdf'.format(p))
+                 for p in self.pagenos]
+        if self.parallel:
+            tables = self.pool.map(self.extractor.get_tables, pages)
+            tables = {k: v for d in tables if d is not None for k, v in d.items()}
+        else:
+            tables = {}
+            if self.extractor.debug:
+                if self.extractor.method == 'stream':
+                    self.debug = self.extractor.debug
+                    self.debug_text = []
+                elif self.extractor.method == 'lattice':
+                    self.debug = self.extractor.debug
+                    self.debug_images = []
+                    self.debug_segments = []
+                    self.debug_tables = []
+            for p in pages:
+                table = self.extractor.get_tables(p)
+                if table is not None:
+                    tables.update(table)
+                if self.extractor.debug:
+                    if self.extractor.method == 'stream':
+                        self.debug_text.append(self.extractor.debug_text)
+                    elif self.extractor.method == 'lattice':
+                        self.debug_images.append(self.extractor.debug_images)
+                        self.debug_segments.append(self.extractor.debug_segments)
+                        self.debug_tables.append(self.extractor.debug_tables)
+        if self.clean:
+            self.remove_tempdir()
+        return tables

-    def convert(self):
-        """Converts single page pdfs to images.
-        """
-        for p in self.pagenos:
-            pdfname = os.path.join(self.temp, 'pg-{0}.pdf'.format(p))
-            imagename = os.path.join(self.temp, 'pg-{0}.png'.format(p))
-            with Image(filename=pdfname, depth=8, resolution=300) as png:
-                png.save(filename=imagename)
-
-    def remove_tempdir(self):
-        shutil.rmtree(self.temp)
+    def debug_plot(self):
+        """Plots all text objects and various pdf geometries so that
+        user can choose number of columns, columns x-coordinates for
+        Stream or tweak Lattice parameters (scale, jtol, mtol).
+        """
+        import matplotlib.pyplot as plt
+        import matplotlib.patches as patches
+
+        if self.debug is True:
+            try:
+                for text in self.debug_text:
+                    fig = plt.figure()
+                    ax = fig.add_subplot(111, aspect='equal')
+                    xs, ys = [], []
+                    for t in text:
+                        xs.extend([t[0], t[1]])
+                        ys.extend([t[2], t[3]])
+                        ax.add_patch(
+                            patches.Rectangle(
+                                (t[0], t[1]),
+                                t[2] - t[0],
+                                t[3] - t[1]
+                            )
+                        )
+                    ax.set_xlim(min(xs) - 10, max(xs) + 10)
+                    ax.set_ylim(min(ys) - 10, max(ys) + 10)
+                    plt.show()
+            except AttributeError:
+                raise ValueError("This option can only be used with Stream.")
+        elif self.debug == 'contour':
+            try:
+                for img, table_bbox in self.debug_images:
+                    for t in table_bbox.keys():
+                        cv2.rectangle(img, (t[0], t[1]),
+                                      (t[2], t[3]), (255, 0, 0), 3)
+                    plt.imshow(img)
+                    plt.show()
+            except AttributeError:
+                raise ValueError("This option can only be used with Lattice.")
+        elif self.debug == 'joint':
+            try:
+                for img, table_bbox in self.debug_images:
+                    x_coord = []
+                    y_coord = []
+                    for k in table_bbox.keys():
+                        for coord in table_bbox[k]:
+                            x_coord.append(coord[0])
+                            y_coord.append(coord[1])
+                    max_x, max_y = max(x_coord), max(y_coord)
+                    plt.plot(x_coord, y_coord, 'ro')
+                    plt.axis([0, max_x + 100, max_y + 100, 0])
+                    plt.imshow(img)
+                    plt.show()
+            except AttributeError:
+                raise ValueError("This option can only be used with Lattice.")
+        elif self.debug == 'line':
+            try:
+                for v_s, h_s in self.debug_segments:
+                    for v in v_s:
+                        plt.plot([v[0], v[2]], [v[1], v[3]])
+                    for h in h_s:
+                        plt.plot([h[0], h[2]], [h[1], h[3]])
+                    plt.show()
+            except AttributeError:
+                raise ValueError("This option can only be used with Lattice.")
+        elif self.debug == 'table':
+            try:
+                for tables in self.debug_tables:
+                    for table in tables:
+                        for i in range(len(table.cells)):
+                            for j in range(len(table.cells[i])):
+                                if table.cells[i][j].left:
+                                    plt.plot([table.cells[i][j].lb[0],
+                                              table.cells[i][j].lt[0]],
+                                             [table.cells[i][j].lb[1],
+                                              table.cells[i][j].lt[1]])
+                                if table.cells[i][j].right:
+                                    plt.plot([table.cells[i][j].rb[0],
+                                              table.cells[i][j].rt[0]],
+                                             [table.cells[i][j].rb[1],
+                                              table.cells[i][j].rt[1]])
+                                if table.cells[i][j].top:
+                                    plt.plot([table.cells[i][j].lt[0],
+                                              table.cells[i][j].rt[0]],
+                                             [table.cells[i][j].lt[1],
+                                              table.cells[i][j].rt[1]])
+                                if table.cells[i][j].bottom:
+                                    plt.plot([table.cells[i][j].lb[0],
+                                              table.cells[i][j].rb[0]],
+                                             [table.cells[i][j].lb[1],
+                                              table.cells[i][j].rb[1]])
+                    plt.show()
+            except AttributeError:
+                raise ValueError("This option can only be used with Lattice.")
+        else:
+            raise UserWarning("This method can only be called after"
+                              " debug has been specified.")
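The ownership is inverted by this commit: Pdf now wraps an extractor rather than the other way round (note that the docs and tests in this same diff still show the older Lattice(Pdf(...)) call). A sketch of the flow that extract() above implies — treat the module paths and call shape as inferred, not documented API:

    from camelot.pdf import Pdf
    from camelot.lattice import Lattice

    extractor = Lattice(scale=15)                # no Pdf object passed in
    pdf = Pdf(extractor, 'us-030.pdf',
              pagenos=[{'start': 1, 'end': 1}], parallel=False, clean=True)
    tables = pdf.extract()   # splits pages, calls extractor.get_tables(page_pdf)
    # tables -> {'page-1': {'table_1': {'score': ..., 'data': [...], ...}}}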
camelot/stream.py

@@ -1,14 +1,26 @@
 from __future__ import print_function
+from __future__ import division
 import os
+import types
+import copy_reg
 import logging

 import numpy as np

-from .utils import get_column_index, encode_list
+from .table import Table
+from .utils import get_row_index, get_score, count_empty, encode_list, pdf_to_text


 __all__ = ['Stream']


+def _reduce_method(m):
+    if m.im_self is None:
+        return getattr, (m.im_class, m.im_func.func_name)
+    else:
+        return getattr, (m.im_self, m.im_func.func_name)
+copy_reg.pickle(types.MethodType, _reduce_method)
+
+
 def _group_rows(text, ytol=2):
     """Groups text objects into rows using ytol.
@@ -35,14 +47,16 @@ def _group_rows(text, ytol=2):
             # type(obj) is LTChar]):
             if t.get_text().strip():
                 if not np.isclose(row_y, t.y0, atol=ytol):
-                    row_y = t.y0
-                    rows.append(temp)
+                    rows.append(sorted(temp, key=lambda t: t.x0))
                     temp = []
+                    row_y = t.y0
                 temp.append(t)
-    rows.append(temp)
+    rows.append(sorted(temp, key=lambda t: t.x0))
+    __ = rows.pop(0)  # hacky
     return rows


-def _merge_columns(l):
+def _merge_columns(l, mtol=2):
     """Merges overlapping columns and returns list with updated
     columns boundaries.

@@ -62,7 +76,8 @@ def _merge_columns(l, mtol=2):
                 merged.append(higher)
             else:
                 lower = merged[-1]
-                if higher[0] <= lower[1]:
+                if (higher[0] <= lower[1] or
+                        np.isclose(higher[0], lower[1], atol=mtol)):
                     upper_bound = max(lower[1], higher[1])
                     lower_bound = min(lower[0], higher[0])
                     merged[-1] = (lower_bound, upper_bound)
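With mtol, column spans that nearly touch get merged too. A quick check of the new condition (values invented):

    import numpy as np

    lower, higher = (10.0, 20.0), (21.5, 30.0)
    # old rule: 21.5 <= 20.0 is False -> spans kept separate
    # new rule: np.isclose(21.5, 20.0, atol=2) is True -> merged into (10.0, 30.0)
    print(higher[0] <= lower[1] or np.isclose(higher[0], lower[1], atol=2))  # True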
@@ -71,6 +86,62 @@ def _merge_columns(l, mtol=2):
     return merged


+def _get_column_index(t, columns):
+    """Gets index of the column in which the given object falls by
+    comparing their co-ordinates.
+
+    Parameters
+    ----------
+    t : object
+
+    columns : list
+
+    Returns
+    -------
+    c_idx : int
+    error : float
+    """
+    offset1, offset2 = 0, 0
+    lt_col_overlap = []
+    for c in columns:
+        if c[0] <= t.x1 and c[1] >= t.x0:
+            left = t.x0 if c[0] <= t.x0 else c[0]
+            right = t.x1 if c[1] >= t.x1 else c[1]
+            lt_col_overlap.append(abs(left - right) / abs(c[0] - c[1]))
+        else:
+            lt_col_overlap.append(-1)
+    if len(filter(lambda x: x != -1, lt_col_overlap)) == 0:
+        logging.warning("Text doesn't fit any column.")
+    c_idx = lt_col_overlap.index(max(lt_col_overlap))
+    if t.x0 < columns[c_idx][0]:
+        offset1 = abs(t.x0 - columns[c_idx][0])
+    if t.x1 > columns[c_idx][1]:
+        offset2 = abs(t.x1 - columns[c_idx][1])
+    Y = abs(t.y0 - t.y1)
+    charea = abs(t.x0 - t.x1) * abs(t.y0 - t.y1)
+    error = (Y * (offset1 + offset2)) / charea
+    return c_idx, error
+
+
+def _add_columns(cols, text, ytolerance):
+    if text:
+        text = _group_rows(text, ytol=ytolerance)
+        elements = [len(r) for r in text]
+        new_cols = [(t.x0, t.x1)
+                    for r in text if len(r) == max(elements) for t in r]
+        cols.extend(_merge_columns(sorted(new_cols)))
+    return cols
+
+
+def _join_columns(cols, width):
+    cols = sorted(cols)
+    cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
+    cols.insert(0, 0)
+    cols.append(width)  # or some tolerance
+    cols = [(cols[i], cols[i + 1])
+            for i in range(0, len(cols) - 1)]
+    return cols
+
+
 class Stream:
     """Stream algorithm
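The helpers compose: _merge_columns collapses overlapping (x0, x1) spans, then _join_columns turns span midpoints into contiguous column boundaries over the full page width. Worked through by hand with the function added above:

    cols = [(28, 67), (180, 230)]
    # midpoint between adjacent spans: (180 + 67) / 2 = 123.5
    # then pad with 0 and the page width:
    # [0, 123.5, 700] -> [(0, 123.5), (123.5, 700)]
    print(_join_columns(cols, width=700))  # [(0, 123.5), (123.5, 700)]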
@@ -105,20 +176,18 @@ class Stream:
         page as value.
     """

-    def __init__(self, pdfobject, ncolumns=0, columns=None, ytol=2,
-                 debug=False, verbose=False):
+    def __init__(self, ncolumns=0, columns=None, ytol=2, mtol=2,
+                 pdf_margin=(2.0, 0.5, 0.1), debug=False):

-        self.pdfobject = pdfobject
+        self.method = 'stream'
         self.ncolumns = ncolumns
         self.columns = columns
         self.ytol = ytol
+        self.mtol = mtol
+        self.char_margin, self.line_margin, self.word_margin = pdf_margin
         self.debug = debug
-        self.verbose = verbose
-        self.tables = {}
-        if self.debug:
-            self.debug_text = {}

-    def get_tables(self):
+    def get_tables(self, pdfname):
         """Returns all tables found in given pdf.

         Returns
@@ -127,86 +196,112 @@ class Stream:
         Dictionary with page number as key and list of tables on that
         page as value.
         """
-        vprint = print if self.verbose else lambda *a, **k: None
-        self.pdfobject.split()
-        for page in self.pdfobject.extract():
-            p, __, text, __, __ = page
-            pkey = 'pg-{0}'.format(p)
+        __, text, width, height = pdf_to_text(pdfname, self.char_margin,
+                                              self.line_margin, self.word_margin)
+        bname, __ = os.path.splitext(pdfname)
+        if not text:
+            logging.warning("{0}: PDF has no text. It may be an image.".format(
+                os.path.basename(bname)))
+            return None
         text.sort(key=lambda x: (-x.y0, x.x0))

         if self.debug:
-            self.debug_text[pkey] = text
+            self.debug_text = [(t.x0, t.y0, t.x1, t.y1) for t in text]
+            return None

-        rows = _group_rows(text, ytol=self.ytol)
-        elements = [len(r) for r in rows]
-        # a table can't have just 1 column, can it?
-        elements = filter(lambda x: x != 1, elements)
+        rows_grouped = _group_rows(text, ytol=self.ytol)
+        elements = [len(r) for r in rows_grouped]
+        row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r)
+                    if len(r) > 0 else 0 for r in rows_grouped]
+        rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
+        rows.insert(0, height)  # or some tolerance
+        rows.append(0)
+        rows = [(rows[i], rows[i + 1])
+                for i in range(0, len(rows) - 1)]

+        guess = False
         if self.columns:
             # user has to input boundary columns too
             # take (0, width) by default
             # similar to else condition
             # len can't be 1
             cols = self.columns.split(',')
+            cols = [(float(cols[i]), float(cols[i + 1]))
+                    for i in range(0, len(cols) - 1)]
         else:
-            guess = True
-            ncols = self.ncolumns if self.ncolumns else max(
-                set(elements), key=elements.count)
-            if ncols == 0:
-                # no tables detected
-                continue
-            cols = [(t.x0, t.x1)
-                    for r in rows for t in r if len(r) == ncols]
-            cols = _merge_columns(sorted(cols))
-            cols = [(c[0] + c[1]) / 2.0 for c in cols]
-
-        ar = [['' for c in cols] for r in rows]
-        for r_idx, r in enumerate(rows):
-            for t in r:
-                if guess:
-                    cog = (t.x0 + t.x1) / 2.0
-                    diff = [abs(cog - c) for c in cols]
-                    c_idx = diff.index(min(diff))
-                else:
-                    c_idx = get_column_index(t, cols)
-                if None in [r_idx, c_idx]:  # couldn't assign LTTextLH to any cell
-                    continue
-                if ar[r_idx][c_idx]:
-                    ar[r_idx][c_idx] = ' '.join(
-                        [ar[r_idx][c_idx], t.get_text().strip()])
-                else:
-                    ar[r_idx][c_idx] = t.get_text().strip()
-            vprint(pkey)
-        self.tables[pkey] = [encode_list(ar)]
+            if self.ncolumns:
+                ncols = self.ncolumns
+                cols = [(t.x0, t.x1)
+                        for r in rows_grouped if len(r) == ncols for t in r]
+                cols = _merge_columns(sorted(cols), mtol=self.mtol)
+                if len(cols) != self.ncolumns:
+                    logging.warning("{}: The number of columns after merge"
+                                    " isn't the same as what you specified."
+                                    " Change the value of mtol.".format(
+                                    os.path.basename(bname)))
+                cols = _join_columns(cols, width)
+            else:
+                guess = True
+                ncols = max(set(elements), key=elements.count)
+                len_non_mode = len(filter(lambda x: x != ncols, elements))
+                if ncols == 1 and not self.debug:
+                    # no tables detected
+                    logging.warning("{}: Only one column was detected, the PDF"
+                                    " may have no tables. Specify ncols if"
+                                    " the PDF has tables.".format(
+                                    os.path.basename(bname)))
+                cols = [(t.x0, t.x1)
+                        for r in rows_grouped if len(r) == ncols for t in r]
+                cols = _merge_columns(sorted(cols), mtol=self.mtol)
+                inner_text = []
+                for i in range(1, len(cols)):
+                    left = cols[i - 1][1]
+                    right = cols[i][0]
+                    inner_text.extend([t for t in text if t.x0 > left and t.x1 < right])
+                outer_text = [t for t in text if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
+                inner_text.extend(outer_text)
+                cols = _add_columns(cols, inner_text, self.ytol)
+                cols = _join_columns(cols, width)

-        if self.pdfobject.clean:
-            self.pdfobject.remove_tempdir()
+        pdf_page = {}
+        page_tables = {}
+        table_info = {}
+        table = Table(cols, rows)
+        rerror = []
+        cerror = []
+        for row in rows_grouped:
+            for t in row:
+                try:
+                    r_idx, rass_error = get_row_index(t, rows)
+                except ValueError as e:
+                    # couldn't assign LTTextLH to any cell
+                    vprint(e.message)
+                    continue
+                try:
+                    c_idx, cass_error = _get_column_index(t, cols)
+                except ValueError as e:
+                    # couldn't assign LTTextLH to any cell
+                    vprint(e.message)
+                    continue
+                rerror.append(rass_error)
+                cerror.append(cass_error)
+                table.cells[r_idx][c_idx].add_text(
+                    t.get_text().strip('\n'))
+        if guess:
+            score = get_score([[33, rerror], [33, cerror], [34, [len_non_mode / len(elements)]]])
+        else:
+            score = get_score([[50, rerror], [50, cerror]])
+        table_info['score'] = score
+        ar = table.get_list()
+        ar = encode_list(ar)
+        table_info['data'] = ar
+        empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
+        table_info['empty_p'] = empty_p
+        table_info['r_nempty_cells'] = r_nempty_cells
+        table_info['c_nempty_cells'] = c_nempty_cells
+        table_info['nrows'] = len(ar)
+        table_info['ncols'] = len(ar[0])
+        page_tables['table_1'] = table_info
+        pdf_page[os.path.basename(bname)] = page_tables

-        if self.debug:
-            return None
-
-        return self.tables
-
-    def plot_text(self):
-        """Plots all text objects so user can choose number of columns
-        or columns x-coordinates using the matplotlib interface.
-        """
-        import matplotlib.pyplot as plt
-        import matplotlib.patches as patches
-
-        for pkey in sorted(self.debug_text.keys()):
-            fig = plt.figure()
-            ax = fig.add_subplot(111, aspect='equal')
-            xs, ys = [], []
-            for t in self.debug_text[pkey]:
-                xs.extend([t.x0, t.x1])
-                ys.extend([t.y0, t.y1])
-                ax.add_patch(
-                    patches.Rectangle(
-                        (t.x0, t.y0),
-                        t.x1 - t.x0,
-                        t.y1 - t.y0
-                    )
-                )
-            ax.set_xlim(min(xs) - 10, max(xs) + 10)
-            ax.set_ylim(min(ys) - 10, max(ys) + 10)
-            plt.show()
+        return pdf_page
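When Stream had to guess the column count, a third error term — the share of rows whose length differs from the modal ncols — joins the mix, and the 33/33/34 weights still sum to the required 100. Illustrative numbers:

    rerror, cerror = [0.1, 0.3], [0.2]
    len_non_mode, n_elements = 2, 10
    score = get_score([[33, rerror], [33, cerror],
                       [34, [len_non_mode / float(n_elements)]]])
    # 33/2*(0.9+0.7) + 33*(0.8) + 34*(0.8) = 26.4 + 26.4 + 27.2 = 80.0
    print(score)  # 80.0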
camelot/table.py

@@ -26,6 +26,7 @@ class Table:
         self.rows = rows
         self.cells = [[Cell(c[0], r[1], c[1], r[0])
                       for c in cols] for r in rows]
+        self.nocont_ = 0

     def set_edges(self, vertical, horizontal, jtol=2):
         """Sets cell edges to True if corresponding line segments

@@ -53,6 +54,7 @@ class Table:
                 k = [k for k, t in enumerate(self.rows)
                      if np.isclose(v[1], t[0], atol=jtol)]
                 if not j:
+                    self.nocont_ += 1
                     continue
                 J = j[0]
                 if i == [0]:  # only left edge

@@ -104,6 +106,7 @@ class Table:
                 k = [k for k, t in enumerate(self.cols)
                      if np.isclose(h[2], t[0], atol=jtol)]
                 if not j:
+                    self.nocont_ += 1
                     continue
                 J = j[0]
                 if i == [0]:  # only top edge
camelot/utils.py
@@ -1,5 +1,18 @@
 from __future__ import division
+import os

 import numpy as np

+from pdfminer.pdfparser import PDFParser
+from pdfminer.pdfdocument import PDFDocument
+from pdfminer.pdfpage import PDFPage
+from pdfminer.pdfpage import PDFTextExtractionNotAllowed
+from pdfminer.pdfinterp import PDFResourceManager
+from pdfminer.pdfinterp import PDFPageInterpreter
+from pdfminer.pdfdevice import PDFDevice
+from pdfminer.converter import PDFPageAggregator
+from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal
+

 def translate(x1, x2):
     """Translates x2 by x1.
@@ -243,15 +256,24 @@ def get_row_index(t, rows):
     ----------
     t : object

-    rows : list
+    rows : list, sorted in decreasing order

     Returns
     -------
     r : int
     """
+    offset1, offset2 = 0, 0
     for r in range(len(rows)):
         if (t.y0 + t.y1) / 2.0 < rows[r][0] and (t.y0 + t.y1) / 2.0 > rows[r][1]:
-            return r
+            if t.y0 > rows[r][0]:
+                offset1 = abs(t.y0 - rows[r][0])
+            if t.y1 < rows[r][1]:
+                offset2 = abs(t.y1 - rows[r][1])
+            X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1)
+            Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1)
+            charea = X * Y
+            error = (X * (offset1 + offset2)) / charea
+            return r, error


 def get_column_index(t, columns):
@@ -268,9 +290,45 @@ def get_column_index(t, columns):
     -------
     c : int
     """
+    offset1, offset2 = 0, 0
     for c in range(len(columns)):
         if (t.x0 + t.x1) / 2.0 > columns[c][0] and (t.x0 + t.x1) / 2.0 < columns[c][1]:
-            return c
+            if t.x0 < columns[c][0]:
+                offset1 = abs(t.x0 - columns[c][0])
+            if t.x1 > columns[c][1]:
+                offset2 = abs(t.x1 - columns[c][1])
+            X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1)
+            Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1)
+            charea = X * Y
+            error = (Y * (offset1 + offset2)) / charea
+            return c, error
+
+
+def get_score(error_weights):
+    """Calculates score based on weights assigned to various parameters,
+    and their error percentages.
+
+    Parameters
+    ----------
+    error_weights : list
+        Two-dimensional list of the form [[weight, [e1, e2, ...]], ...],
+        pairing each weight with its list of error fractions. The
+        weights should sum to 100.
+
+    Returns
+    -------
+    score : float
+    """
+    SCORE_VAL = 100
+    score = 0
+    if sum([ew[0] for ew in error_weights]) != SCORE_VAL:
+        raise ValueError("Please assign a valid weightage to each parameter"
+                         " such that their sum is equal to 100")
+    for ew in error_weights:
+        weight = ew[0] / len(ew[1])
+        for error_percentage in ew[1]:
+            score += weight * (1 - error_percentage)
+    return score
+
+
 def reduce_index(t, rotated, r_idx, c_idx):
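A worked example of get_score, matching how Lattice calls it with two equally weighted error lists (numbers invented):

    rerror = [0.1, 0.2]   # per-character row-assignment error fractions
    cerror = [0.0]        # column-assignment errors
    score = get_score([[50, rerror], [50, cerror]])
    # 50/2*(0.9) + 50/2*(0.8) + 50*(1.0) = 22.5 + 20.0 + 50.0
    print(score)  # 92.5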
@@ -394,6 +452,110 @@ def remove_empty(d):
     return d


+def count_empty(d):
+    """Counts empty rows and columns from list of lists.
+
+    Parameters
+    ----------
+    d : list
+
+    Returns
+    -------
+    empty_p : percentage of empty cells
+    r_nempty_cells : number of non-empty cells in each row
+    c_nempty_cells : number of non-empty cells in each column
+    """
+    empty_p = 0
+    r_nempty_cells, c_nempty_cells = [], []
+    for i in d:
+        for j in i:
+            if j.strip() == '':
+                empty_p += 1
+    empty_p = 100 * (empty_p / float(len(d) * len(d[0])))
+    for row in d:
+        r_nempty_c = 0
+        for r in row:
+            if r.strip() != '':
+                r_nempty_c += 1
+        r_nempty_cells.append(r_nempty_c)
+    d = zip(*d)
+    d = [list(col) for col in d]
+    for col in d:
+        c_nempty_c = 0
+        for c in col:
+            if c.strip() != '':
+                c_nempty_c += 1
+        c_nempty_cells.append(c_nempty_c)
+    return empty_p, r_nempty_cells, c_nempty_cells
+
+
 def encode_list(ar):
     """Encodes list of text.

     Parameters
     ----------
     ar : list

     Returns
     -------
     ar : list
     """
     ar = [[r.encode('utf-8') for r in row] for row in ar]
     return ar


+def extract_text_objects(layout, LTObject, t=None):
+    """Recursively parses pdf layout to get a list of
+    text objects.
+
+    Parameters
+    ----------
+    layout : object
+        Layout object.
+
+    LTObject : object
+        Text object, either LTChar or LTTextLineHorizontal.
+
+    t : list (optional, default: None)
+
+    Returns
+    -------
+    t : list
+        List of text objects.
+    """
+    if t is None:
+        t = []
+    try:
+        for obj in layout._objs:
+            if isinstance(obj, LTObject):
+                t.append(obj)
+            else:
+                t += extract_text_objects(obj, LTObject)
+    except AttributeError:
+        pass
+    return t
+
+
+def pdf_to_text(pname, char_margin, line_margin, word_margin):
+    with open(pname, 'r') as f:
+        parser = PDFParser(f)
+        document = PDFDocument(parser)
+        if not document.is_extractable:
+            raise PDFTextExtractionNotAllowed
+        laparams = LAParams(char_margin=char_margin,
+                            line_margin=line_margin,
+                            word_margin=word_margin)
+        rsrcmgr = PDFResourceManager()
+        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
+        interpreter = PDFPageInterpreter(rsrcmgr, device)
+        for page in PDFPage.create_pages(document):
+            interpreter.process_page(page)
+            layout = device.get_result()
+            lattice_objects = extract_text_objects(layout, LTChar)
+            stream_objects = extract_text_objects(
+                layout, LTTextLineHorizontal)
+            width = layout.bbox[2]
+            height = layout.bbox[3]
+            return lattice_objects, stream_objects, width, height
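count_empty in action on a tiny 2x3 grid (hand-checked):

    d = [['a', '', 'c'],
         ['',  '', 'f']]
    # empty cells: 3 of 6 -> empty_p = 50.0
    # non-empty per row: [2, 1]; per column: [1, 0, 2]
    print(count_empty(d))  # (50.0, [2, 1], [1, 0, 2])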
@@ -39,7 +39,7 @@ Usage

     >>> extractor = Lattice(Pdf('us-030.pdf'))
     >>> tables = extractor.get_tables()
-    >>> print tables['pg-1']
+    >>> print tables['page-1'][0]

 .. csv-table::
     :header: "Cycle Name","KI (1/km)","Distance (mi)","Percent Fuel Savings","","",""

@@ -65,7 +65,7 @@ Finally, the characters found on the page are assigned to cells based on their x

     >>> extractor = Lattice(Pdf('us-030.pdf'))
     >>> tables = extractor.get_tables()
-    >>> print tables['pg-1']
+    >>> print tables['page-1'][0]

 .. csv-table::
     :header: "Cycle Name","KI (1/km)","Distance (mi)","Percent Fuel Savings","","",""

@@ -114,7 +114,7 @@ In the PDF used above, you can see that some cells spanned a lot of rows, `fill`

     >>> extractor = Lattice(Pdf('row_span_1.pdf'), fill='v', scale=40)
     >>> tables = extractor.get_tables()
-    >>> print tables['pg-1']
+    >>> print tables['page-1'][0]

 .. csv-table::
     :header: "Plan Type","County","Plan Name","Totals"

@@ -173,7 +173,7 @@ To find line segments, Lattice needs the lines of the PDF to be in foreground. S

     >>> extractor = Lattice(Pdf('lines_in_background_1.pdf'), invert=True)
     >>> tables = extractor.get_tables()
-    >>> print tables['pg-1']
+    >>> print tables['page-1'][0]

 .. csv-table::
     :header: "State","Date","Halt stations","Halt days","Persons directly reached(in lakh)","Persons trained","Persons counseled","Persons testedfor HIV"

@@ -17,7 +17,7 @@ Let's run it on this PDF.

     >>> extractor = Stream(Pdf('eu-027.pdf'))
     >>> tables = extractor.get_tables()
-    >>> print tables['pg-1']
+    >>> print tables['page-1'][0]

 .. .. _this: insert link for eu-027.pdf

@@ -68,7 +68,7 @@ But sometimes its guess could be incorrect, like in this case.

     >>> extractor = Stream(Pdf('missing_values.pdf'))
     >>> tables = extractor.get_tables()
-    >>> print tables['pg-1']
+    >>> print tables['page-1'][0]

 .. .. _this: insert link for missing_values.pdf

@@ -127,7 +127,7 @@ It guessed that the PDF has 3 columns, because there wasn't any data in the last

     >>> extractor = Stream(Pdf('missing_values.pdf'), ncolumns=5)
     >>> tables = extractor.get_tables()
-    >>> print tables['pg-1']
+    >>> print tables['page-1'][0]

 .. csv-table::

@@ -200,7 +200,7 @@ After getting the x-coordinates, we just need to pass them to Stream, like this.

     >>> extractor = Stream(Pdf('mexican_towns.pdf'), columns='28,67,180,230,425,475,700')
     >>> tables = extractor.get_tables()
-    >>> print tables['pg-1']
+    >>> print tables['page-1'][0]

 .. csv-table::
@@ -26,7 +26,7 @@ def test_lattice_basic():
     extractor = Lattice(Pdf(pdfname,
                         pagenos=[{'start': 2, 'end': 2}], clean=True))
     tables = extractor.get_tables()
-    assert_equal(tables['pg-2'][0], data)
+    assert_equal(tables['page-2'][0], data)


 def test_lattice_fill():

@@ -76,7 +76,7 @@ def test_lattice_fill():
     pdfname = os.path.join(testdir, 'row_span_1.pdf')
     extractor = Lattice(Pdf(pdfname, clean=True), fill='v', scale=40)
     tables = extractor.get_tables()
-    assert_equal(tables['pg-1'][0], data)
+    assert_equal(tables['page-1'][0], data)


 def test_lattice_invert():

@@ -94,4 +94,4 @@ def test_lattice_invert():
     pdfname = os.path.join(testdir, 'lines_in_background_1.pdf')
     extractor = Lattice(Pdf(pdfname, clean=True), invert=True)
     tables = extractor.get_tables()
-    assert_equal(tables['pg-1'][1], data)
+    assert_equal(tables['page-1'][1], data)
@@ -13,57 +13,62 @@ testdir = os.path.dirname(os.path.abspath(__file__))
 def test_stream_basic():

     data = [
-        ["","","","",""],
-        ["C Appendix C: Summary Statistics","","","",""],
-        ["","Table C1: Summary Statistics","","",""],
-        ["","This table contains summary statistics for 2,012 respondents in SAVE 2009.","","",""],
-        ["Variable","Mean","Std. Dev. Min","","Max"],
-        ["Age","50.8","15.9","21","90"],
-        ["Men","0.47","0.50","0","1"],
-        ["East","0.28","0.45","0","1"],
-        ["Rural","0.15","0.36","0","1"],
-        ["Married","0.57","0.50","0","1"],
-        ["Single","0.21","0.40","0","1"],
-        ["Divorced","0.13","0.33","0","1"],
-        ["Widowed","0.08","0.26","0","1"],
-        ["Separated","0.03","0.16","0","1"],
-        ["Partner","0.65","0.48","0","1"],
-        ["Employed","0.55","0.50","0","1"],
-        ["Fulltime","0.34","0.47","0","1"],
-        ["Parttime","0.20","0.40","0","1"],
-        ["Unemployed","0.08","0.28","0","1"],
-        ["Homemaker","0.19","0.40","0","1"],
-        ["Retired","0.28","0.45","0","1"],
-        ["Household size","2.43","1.22","1","9"],
-        ["Households with children","0.37","0.48","0","1"],
-        ["Number of children","1.67","1.38","0","8"],
-        ["Lower secondary education","0.08","0.27","0","1"],
-        ["Upper secondary education","0.60","0.49","0","1"],
-        ["Post secondary, non tert. education","0.12","0.33","0","1"],
-        ["First stage tertiary education","0.17","0.38","0","1"],
-        ["Other education","0.03","0.17","0","1"],
-        ["Household income (Euro/month)","2,127","1,389","22","22,500"],
-        ["Gross wealth - end of 2007 (Euro)","187,281","384,198","0","7,720,000"],
-        ["Gross financial wealth - end of 2007 (Euro)","38,855","114,128","0","2,870,000"],
-        ["","Source: SAVE 2008 and 2009, data is weighted and imputed.","","",""],
-        ["","","","","ECB"],
-        ["","","","","Working Paper Series No 1299"],
-        ["","","","","Febuary 2011"]
+        ["Clave","Nombre Entidad","Clave","Nombre Municipio","Clave","Nombre Localidad"],
+        ["Entidad","","Municipio","","Localidad",""],
+        ["01","Aguascalientes","001","Aguascalientes","0094","Granja Adelita"],
+        ["01","Aguascalientes","001","Aguascalientes","0096","Agua Azul"],
+        ["01","Aguascalientes","001","Aguascalientes","0100","Rancho Alegre"],
+        ["01","Aguascalientes","001","Aguascalientes","0102","Los Arbolitos [Rancho]"],
+        ["01","Aguascalientes","001","Aguascalientes","0104","Ardillas de Abajo (Las Ardillas)"],
+        ["01","Aguascalientes","001","Aguascalientes","0106","Arellano"],
+        ["01","Aguascalientes","001","Aguascalientes","0112","Bajío los Vázquez"],
+        ["01","Aguascalientes","001","Aguascalientes","0113","Bajío de Montoro"],
+        ["01","Aguascalientes","001","Aguascalientes","0114","Residencial San Nicolás [Baños la Cantera]"],
+        ["01","Aguascalientes","001","Aguascalientes","0120","Buenavista de Peñuelas"],
+        ["01","Aguascalientes","001","Aguascalientes","0121","Cabecita 3 Marías (Rancho Nuevo)"],
+        ["01","Aguascalientes","001","Aguascalientes","0125","Cañada Grande de Cotorina"],
+        ["01","Aguascalientes","001","Aguascalientes","0126","Cañada Honda [Estación]"],
+        ["01","Aguascalientes","001","Aguascalientes","0127","Los Caños"],
+        ["01","Aguascalientes","001","Aguascalientes","0128","El Cariñán"],
+        ["01","Aguascalientes","001","Aguascalientes","0129","El Carmen [Granja]"],
+        ["01","Aguascalientes","001","Aguascalientes","0135","El Cedazo (Cedazo de San Antonio)"],
+        ["01","Aguascalientes","001","Aguascalientes","0138","Centro de Arriba (El Taray)"],
+        ["01","Aguascalientes","001","Aguascalientes","0139","Cieneguilla (La Lumbrera)"],
+        ["01","Aguascalientes","001","Aguascalientes","0141","Cobos"],
+        ["01","Aguascalientes","001","Aguascalientes","0144","El Colorado (El Soyatal)"],
+        ["01","Aguascalientes","001","Aguascalientes","0146","El Conejal"],
+        ["01","Aguascalientes","001","Aguascalientes","0157","Cotorina de Abajo"],
+        ["01","Aguascalientes","001","Aguascalientes","0162","Coyotes"],
+        ["01","Aguascalientes","001","Aguascalientes","0166","La Huerta (La Cruz)"],
+        ["01","Aguascalientes","001","Aguascalientes","0170","Cuauhtémoc (Las Palomas)"],
+        ["01","Aguascalientes","001","Aguascalientes","0171","Los Cuervos (Los Ojos de Agua)"],
+        ["01","Aguascalientes","001","Aguascalientes","0172","San José [Granja]"],
+        ["01","Aguascalientes","001","Aguascalientes","0176","La Chiripa"],
+        ["01","Aguascalientes","001","Aguascalientes","0182","Dolores"],
+        ["01","Aguascalientes","001","Aguascalientes","0183","Los Dolores"],
+        ["01","Aguascalientes","001","Aguascalientes","0190","El Duraznillo"],
+        ["01","Aguascalientes","001","Aguascalientes","0191","Los Durón"],
+        ["01","Aguascalientes","001","Aguascalientes","0197","La Escondida"],
+        ["01","Aguascalientes","001","Aguascalientes","0201","Brande Vin [Bodegas]"],
+        ["01","Aguascalientes","001","Aguascalientes","0207","Valle Redondo"],
+        ["01","Aguascalientes","001","Aguascalientes","0209","La Fortuna"],
+        ["01","Aguascalientes","001","Aguascalientes","0212","Lomas del Gachupín"],
+        ["01","Aguascalientes","001","Aguascalientes","0213","El Carmen (Gallinas Güeras) [Rancho]"],
+        ["01","Aguascalientes","001","Aguascalientes","0216","La Gloria"],
+        ["01","Aguascalientes","001","Aguascalientes","0226","Hacienda Nueva"],
     ]

-    pdfname = os.path.join(testdir,
-        "tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-027.pdf")
-    extractor = Stream(Pdf(pdfname, pagenos=[{'start': 3, 'end': 3}],
+    pdfname = os.path.join(testdir, 'mexican_towns.pdf')
+    extractor = Stream(Pdf(pdfname, pagenos=[{'start': 1, 'end': 1}],
                        clean=True))
     tables = extractor.get_tables()
-    assert_equal(tables['pg-3'][0], data)
+    assert_equal(tables['page-1'][0], data)


 def test_stream_ncolumns():

     data = [
         ["","","","",""],
-        ["","Bhandara - Key Indicators","","",""],
+        ["Bhandara - Key Indicators","","","",""],
         ["","DLHS-4 (2012-13)","","DLHS-3 (2007-08)",""],
         ["Indicators","TOTAL","RURAL","TOTAL","RURAL"],
         ["Reported Prevalence of Morbidity","","","",""],

@@ -105,21 +110,20 @@ def test_stream_ncolumns():
         ["Moderately High (Systolic >160 mm of Hg & Diastolic >100 mm of Hg ) .....................................","8.2","7.1","",""],
         ["Very High (Systolic >180 mm of Hg & Diastolic >110 mm of Hg ) ...............................................","3.7","3.1","",""],
         ["14 Any anaemia below 11g/dl, severe anaemia below 7g/dl. 15 Excluding age group 19 years","","","",""],
-        ["","Chronic Illness :Any person with symptoms persisting for longer than one month is defined as suffering from chronic illness","","",""]
+        ["Chronic Illness :Any person with symptoms persisting for longer than one month is defined as suffering from chronic illness","","","",""],
+        ["","4","","",""]
     ]
     pdfname = os.path.join(testdir, 'missing_values.pdf')
     extractor = Stream(Pdf(pdfname, char_margin=1.0, clean=True),
                        ncolumns=5)
     tables = extractor.get_tables()
-    assert_equal(tables['pg-1'][0], data)
+    assert_equal(tables['page-1'][0], data)


 def test_stream_columns():

     data = [
-        ["","","","","",""],
-        ["Clave","","Clave","","Clave",""],
-        ["","Nombre Entidad","","Nombre Municipio","","Nombre Localidad"],
+        ["Clave","Nombre Entidad","Clave","Nombre Municipio","Clave","Nombre Localidad"],
         ["Entidad","","Municipio","","Localidad",""],
         ["01","Aguascalientes","001","Aguascalientes","0094","Granja Adelita"],
         ["01","Aguascalientes","001","Aguascalientes","0096","Agua Azul"],

@@ -160,10 +164,11 @@ def test_stream_columns():
         ["01","Aguascalientes","001","Aguascalientes","0209","La Fortuna"],
         ["01","Aguascalientes","001","Aguascalientes","0212","Lomas del Gachupín"],
         ["01","Aguascalientes","001","Aguascalientes","0213","El Carmen (Gallinas Güeras) [Rancho]"],
-        ["01","Aguascalientes","001","Aguascalientes","0216","La Gloria"]
+        ["01","Aguascalientes","001","Aguascalientes","0216","La Gloria"],
+        ["01","Aguascalientes","001","Aguascalientes","0226","Hacienda Nueva"],
     ]
     pdfname = os.path.join(testdir, 'mexican_towns.pdf')
     extractor = Stream(Pdf(pdfname, clean=True),
                        columns='28,67,180,230,425,475,700')
     tables = extractor.get_tables()
-    assert_equal(tables['pg-1'][0], data)
+    assert_equal(tables['page-1'][0], data)
tools/camelot
@@ -4,8 +4,12 @@ import os
 import sys
 import time
 import logging
+import warnings

+import numpy as np
 from docopt import docopt
+from collections import Counter
+import matplotlib.pyplot as plt
+from PyPDF2 import PdfFileReader

 from camelot.pdf import Pdf
@@ -22,12 +26,23 @@ usage:
 options:
     -h, --help                Show this screen.
     -v, --version             Show version.
-    -V, --verbose             Verbose.
     -p, --pages <pageno>      Comma-separated list of page numbers.
                               Example: -p 1,3-6,10 [default: 1]
+    -P, --parallel            Parallelize the parsing process.
     -f, --format <format>     Output format. (csv,tsv,html,json,xlsx) [default: csv]
-    -l, --log                 Print log to file.
+    -V, --verbose             Verbose.
+    -l, --log                 Log to file.
     -o, --output <directory>  Output directory.
+    -M, --cmargin <cmargin>   Char margin. Chars closer than cmargin are
+                              grouped together to form a word. [default: 2.0]
+    -L, --lmargin <lmargin>   Line margin. Lines closer than lmargin are
+                              grouped together to form a textbox. [default: 0.5]
+    -W, --wmargin <wmargin>   Word margin. Insert blank spaces between chars
+                              if distance between words is greater than word
+                              margin. [default: 0.1]
+    -S, --save-info           Save parsing info for each page to a file.
+    -X, --plot <dist>         Plot distributions. (page,all,rc)
+    -Z, --summary             Summarize metrics.

 camelot methods:
     lattice                   Looks for lines between data.

@@ -47,12 +62,12 @@ options:
                               cells. Example: -F h, -F v, -F hv
     -s, --scale <scale>       Scaling factor. Large scaling factor leads to
                               smaller lines being detected. [default: 15]
-    -i, --invert              Invert pdf image to make sure that lines are
-                              in foreground.
     -j, --jtol <jtol>         Tolerance to account for when comparing joint
                               and line coordinates. [default: 2]
     -m, --mtol <mtol>         Tolerance to account for when merging lines
                               which are very close. [default: 2]
+    -i, --invert              Invert pdf image to make sure that lines are
+                              in foreground.
    -d, --debug <debug>       Debug by visualizing pdf geometry.
                              (contour,line,joint,table) Example: -d table
 """
@ -69,17 +84,159 @@ options:
|
|||
Example: -c 10.1,20.2,30.3
|
||||
-y, --ytol <ytol> Tolerance to account for when grouping rows
|
||||
together. [default: 2]
|
||||
-M, --cmargin <cmargin> Char margin. Chars closer than cmargin are
|
||||
grouped together to form a word. [default: 2.0]
|
||||
-L, --lmargin <lmargin> Line margin. Lines closer than lmargin are
|
||||
grouped together to form a textbox. [default: 0.5]
|
||||
-W, --wmargin <wmargin> Word margin. Insert blank spaces between chars
|
||||
if distance between words is greater than word
|
||||
margin. [default: 0.1]
|
||||
-m, --mtol <mtol> Tolerance to account for when merging columns
|
||||
together. [default: 2]
|
||||
-d, --debug Debug by visualizing textboxes.
|
||||
"""
|
||||
|
||||
|
||||
def plot_table_barchart(r, c, p, pno, tno):
|
||||
row_idx = [i + 1 for i, row in enumerate(r)]
|
||||
col_idx = [i + 1 for i, col in enumerate(c)]
|
||||
r_index = np.arange(len(r))
|
||||
c_index = np.arange(len(c))
|
||||
width = 0.7
|
||||
|
||||
plt.figure(figsize=(8, 6))
|
||||
plt.subplot(2, 1, 1)
|
||||
plt.title('Percentage of empty cells in table: {0:.2f}'.format(p))
|
||||
plt.xlabel('row index')
|
||||
plt.ylabel('number of non-empty cells in row')
|
||||
plt.bar(r_index, r)
|
||||
plt.xticks(r_index + width * 0.5, row_idx)
|
||||
plt.ylim(0, len(c))
|
||||
|
||||
plt.subplot(2, 1, 2)
|
||||
plt.xlabel('column index')
|
||||
plt.ylabel('number of non-empty cells in column')
|
||||
plt.bar(c_index, c)
|
||||
plt.xticks(c_index + width * 0.5, col_idx)
|
||||
plt.ylim(0, len(r))
|
||||
plt.savefig(''.join([pno, '_', tno, '.png']), dpi=300)


def plot_all_barchart(data, output):
    r_empty_cells = []
    for page_number in data.keys():
        page = data[page_number]
        for table_number in page.keys():
            table = page[table_number]
            r_empty_cells.extend([r / float(table['ncols'])
                                  for r in table['r_nempty_cells']])
    c = Counter(r_empty_cells)
    if 0.0 not in c:
        c.update({0.0: 0})
    if 1.0 not in c:
        c.update({1.0: 0})

    plt.figure(figsize=(8, 6))
    plt.xlabel('percentage of non-empty cells in a row')
    plt.ylabel('percentage of rows processed')
    row_p = [count / float(sum(c.values())) for count in c.values()]
    plt.bar(c.keys(), row_p, align='center', width=0.05)
    plt.ylim(0, 1.0)
    plt.savefig(''.join([output, '_all.png']), dpi=300)
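
Each row contributes its fill fraction (non-empty cells over ncols), and the
Counter turns the distinct fractions into bar heights. A worked example with
hypothetical numbers:

    from collections import Counter

    # one 4-column table whose rows have 4, 2 and 0 non-empty cells
    fractions = [r / 4.0 for r in [4, 2, 0]]  # -> [1.0, 0.5, 0.0]
    c = Counter(fractions)
    heights = [n / float(sum(c.values())) for n in c.values()]
    # three distinct fractions, one row each -> every bar has height 1/3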


def plot_rc_piechart(data, output):
    from matplotlib import cm

    tables = 0
    rows, cols = [], []
    for page_number in data.keys():
        page = data[page_number]
        for table_number in page.keys():
            table = page[table_number]
            tables += 1
            rows.append(table['nrows'])
            cols.append(table['ncols'])

    r = Counter(rows)
    c = Counter(cols)

    plt.figure(figsize=(8, 6))
    cs1 = cm.Set1(np.arange(len(r)) / float(len(r)))
    ax1 = plt.subplot(211, aspect='equal')
    ax1.pie(r.values(), colors=cs1, labels=r.keys(), startangle=90)
    ax1.set_title('row distribution across tables')

    cs2 = cm.Set1(np.arange(len(c)) / float(len(c)))
    ax2 = plt.subplot(212, aspect='equal')
    ax2.pie(c.values(), colors=cs2, labels=c.keys(), startangle=90)
    ax2.set_title('column distribution across tables')
    plt.savefig(''.join([output, '_rc.png']), dpi=300)


def summary(data, p_time):
    from operator import itemgetter
    from itertools import groupby

    scores = []
    continuous_tables = []
    total_tables = 0
    for page_number in data.keys():
        page = data[page_number]
        total_tables += len(page.keys())
        for table_number in page.keys():
            table = page[table_number]
            continuous_tables.append((page_number, table_number, table['ncols']))
            scores.append(table['score'])
    avg_score = np.mean(scores)

    ct_pages = []
    header_string = ""
    if len(continuous_tables) > 1:
        tables = sorted(continuous_tables, key=lambda x: (int(x[0][5:]), int(x[1][6:])))
        for k, g in groupby(tables, key=itemgetter(2)):
            g = list(g)
            tables_same_ncols = set([int(t[0][5:]) for t in g])
            tables_same_ncols = sorted(list(tables_same_ncols))
            for K, G in groupby(enumerate(tables_same_ncols), key=lambda (i, x): i - x):
                G = list(G)
                ct_pages.append((str(G[0][1]), str(G[-1][1])))

        result_headers = []
        for ct in ct_pages:
            header_idx = {}
            possible_headers = []
            ncols = 0
            for page_number in range(int(ct[0]), int(ct[1]) + 1):
                page = data['page-{0}'.format(page_number)]
                for table_number in page.keys():
                    table = page[table_number]
                    ncols = table['ncols']
                    for i, row in enumerate(table['data']):
                        try:
                            header_idx[tuple(row)].append(i)
                        except KeyError:
                            header_idx[tuple(row)] = [i]
            possible_headers = sorted(header_idx, key=lambda k: len(header_idx[k]), reverse=True)[:10]
            possible_headers = filter(lambda z: len(z) == ncols,
                                      [filter(lambda x: x != '', p_h) for p_h in possible_headers])
            modes = []
            for p_h in possible_headers:
                try:
                    modes.append((p_h, max(set(header_idx[p_h]), key=header_idx[p_h].count)))
                except KeyError:
                    pass
            header = min(modes, key=lambda x: x[1])[0]
            result_headers.append(header)

        header_string = "Multi-page table headers*:\n"
        header_string = ''.join([header_string, '\n'.join(['pages {0} -> {1}{2}{3}'.format(
            '-'.join([cr[0][0], cr[0][1]]), '"', '","'.join(cr[1]), '"') for cr in zip(
            ct_pages, result_headers)])])

    avg_time = "Time taken per page: {0:.2f} seconds\n".format(
        p_time / float(len(data))) if len(data) != 1 else ""
    equal_ncols = "\nMulti-page tables on*: {0}\n".format(
        ', '.join(['-'.join(ct) for ct in ct_pages])) if len(data) != 1 else ""
    stats = [len(data), p_time, avg_time, total_tables, avg_score, equal_ncols]
    stat_string = ("Pages processed: {0}\nTime taken: {1:.2f} seconds\n"
                   "{2}Tables found: {3}\nAverage score: {4:.2f}{5}".format(*stats))

    print(''.join([stat_string, header_string]))
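
The key=lambda (i, x): i - x groupby above is the standard idiom for
splitting a sorted integer sequence into runs of consecutive values: within a
run, index and value increase in lockstep, so their difference is constant.
A standalone illustration, using the same Python 2 tuple-parameter syntax as
the code above:

    from itertools import groupby

    pages = [1, 2, 3, 7, 8, 10]
    runs = []
    for _, g in groupby(enumerate(pages), key=lambda (i, x): i - x):
        g = list(g)
        runs.append((g[0][1], g[-1][1]))  # (first, last) page of each run
    print(runs)  # -> [(1, 3), (7, 8), (10, 10)]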


def convert_to_html(table):
    html = ''
    html = ''.join([html, '<table border="1">\n'])

@ -99,23 +256,23 @@ def write_to_disk(data, f='csv', output=None, filename=None):
    if f in ['csv', 'tsv']:
        import csv
        delimiter = ',' if f == 'csv' else '\t'
        for page in sorted(data):
            for table in range(len(data[page])):
                dsvname = '{0}_table_{1}.{2}'.format(page, table + 1, f)
        for page_number in sorted(data.keys()):
            for table_number in sorted(data[page_number].keys()):
                dsvname = '{0}.{1}'.format(''.join([page_number, '_', table_number]), f)
                with open(os.path.join(output, dsvname), 'w') as outfile:
                    writer = csv.writer(
                        outfile, delimiter=delimiter, quoting=csv.QUOTE_ALL)
                    for row in data[page][table]:
                    for row in data[page_number][table_number]['data']:
                        writer.writerow(row)
    elif f == 'html':
        htmlname = '{}.html'.format(froot)
        for page in sorted(data):
            for table in range(len(data[page])):
        htmlname = '{0}.html'.format(froot)
        for page_number in sorted(data.keys()):
            for table_number in sorted(data[page_number].keys()):
                with open(os.path.join(output, htmlname), 'a') as htmlfile:
                    htmlfile.write(convert_to_html(data[page][table]))
                    htmlfile.write(convert_to_html(data[page_number][table_number]['data']))
    elif f == 'json':
        import json
        with open(os.path.join(output, '{}.json'.format(froot)), 'w') \
        with open(os.path.join(output, '{0}.json'.format(froot)), 'w') \
                as jsonfile:
            json.dump(data, jsonfile)
    elif f == 'xlsx':

@ -123,12 +280,12 @@ def write_to_disk(data, f='csv', output=None, filename=None):
        from pyexcel_xlsx import save_data
        from collections import OrderedDict
        xlsx_data = OrderedDict()
        for page in sorted(data):
            for table in range(len(data[page])):
                sheet_name = '{0}_table_{1}'.format(page, table + 1)
        for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
            for table_number in sorted(data[page_number].keys(), key=lambda x: int(x[6:])):
                sheet_name = ''.join([page_number, '_', table_number])
                xlsx_data.update({sheet_name:
                    [row for row in data[page][table]]})
        save_data(os.path.join(output, '{}.xlsx'.format(froot)), xlsx_data)
                    [row for row in data[page_number][table_number]['data']]})
        save_data(os.path.join(output, '{0}.xlsx'.format(froot)), xlsx_data)
    except ImportError:
        print("link to install docs")


@ -147,16 +304,17 @@ if __name__ == '__main__':
    filename = args['<file>']
    filedir = os.path.dirname(args['<file>'])
    logname, __ = os.path.splitext(filename)
    logname += '.log'
    logname = ''.join([logname, '.log'])
    scorename, __ = os.path.splitext(filename)
    scorename = ''.join([scorename, '_info.csv'])
    pngname, __ = os.path.splitext(filename)

    if args['--log']:
        FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
        if args['--output']:
            logname = os.path.join(args['--output'], os.path.basename(logname))
            logging.basicConfig(
                filename=logname, filemode='w', level=logging.DEBUG)
        else:
            logging.basicConfig(
                filename=logname, filemode='w', level=logging.DEBUG)
                filename=logname, filemode='w', format=FORMAT, level=logging.DEBUG)

    p = []
    if args['--pages'] == '1':

@ -173,47 +331,142 @@ if __name__ == '__main__':
        else:
            p.append({'start': int(r), 'end': int(r)})

    margin_tuple = (float(args['--cmargin']), float(args['--lmargin']),
                    float(args['--wmargin']))
    if args['<method>'] == 'lattice':
        try:
            extractor = Lattice(Pdf(filename, pagenos=p, clean=True),
            manager = Pdf(Lattice(
                fill=args['--fill'],
                scale=int(args['--scale']),
                invert=args['--invert'],
                jtol=int(args['--jtol']),
                mtol=int(args['--mtol']),
                invert=args['--invert'],
                debug=args['--debug'],
                verbose=args['--verbose'])
            data = extractor.get_tables()
                pdf_margin=margin_tuple,
                debug=args['--debug']),
                filename,
                pagenos=p,
                parallel=args['--parallel'],
                clean=True)
            data = manager.extract()

            processing_time = time.time() - start_time
            vprint("Finished processing in", processing_time, "seconds")
            logging.info("Finished processing in " + str(processing_time) + " seconds")

            if args['--plot']:
                if args['--output']:
                    pngname = os.path.join(args['--output'], os.path.basename(pngname))
                plot_type = args['--plot'].split(',')
                if 'page' in plot_type:
                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
                        page = data[page_number]
                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
                            table = page[table_number]
                            plot_table_barchart(table['r_nempty_cells'],
                                                table['c_nempty_cells'],
                                                table['empty_p'],
                                                page_number,
                                                table_number)

                if 'all' in plot_type:
                    plot_all_barchart(data, pngname)

                if 'rc' in plot_type:
                    plot_rc_piechart(data, pngname)

            if args['--summary']:
                summary(data, processing_time)

            if args['--save-info']:
                if args['--output']:
                    scorename = os.path.join(args['--output'], os.path.basename(scorename))
                with open(scorename, 'w') as score_file:
                    score_file.write('table,nrows,ncols,empty_p,line_p,text_p,score\n')
                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
                        page = data[page_number]
                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
                            table = page[table_number]
                            score_file.write('{0},{1},{2},{3},{4},{5},{6}\n'.format(
                                ''.join([page_number, '_', table_number]),
                                table['nrows'],
                                table['ncols'],
                                table['empty_p'],
                                table['line_p'],
                                table['text_p'],
                                table['score']))
            if args['--debug']:
                extractor.plot_geometry(args['--debug'])
                manager.debug_plot()
        except Exception as e:
            logging.exception(e.message, exc_info=True)
            sys.exit()
    elif args['<method>'] == 'stream':
        try:
            extractor = Stream(Pdf(filename, pagenos=p,
                char_margin=float(args['--cmargin']),
                line_margin=float(args['--lmargin']),
                word_margin=float(args['--wmargin']),
                clean=True),
            manager = Pdf(Stream(
                ncolumns=int(args['--ncols']),
                columns=args['--columns'],
                ytol=int(args['--ytol']),
                debug=args['--debug'],
                verbose=args['--verbose'])
            data = extractor.get_tables()
                mtol=int(args['--mtol']),
                pdf_margin=margin_tuple,
                debug=args['--debug']),
                filename,
                pagenos=p,
                parallel=args['--parallel'],
                clean=True)
            data = manager.extract()

            processing_time = time.time() - start_time
            vprint("Finished processing in", processing_time, "seconds")
            logging.info("Finished processing in " + str(processing_time) + " seconds")

            if args['--plot']:
                if args['--output']:
                    pngname = os.path.join(args['--output'], os.path.basename(pngname))
                plot_type = args['--plot'].split(',')
                if 'page' in plot_type:
                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
                        page = data[page_number]
                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
                            table = page[table_number]
                            plot_table_barchart(table['r_nempty_cells'],
                                                table['c_nempty_cells'],
                                                table['empty_p'],
                                                page_number,
                                                table_number)

                if 'all' in plot_type:
                    plot_all_barchart(data, pngname)

                if 'rc' in plot_type:
                    plot_rc_piechart(data, pngname)

            if args['--summary']:
                summary(data, processing_time)

            if args['--save-info']:
                if args['--output']:
                    scorename = os.path.join(args['--output'], os.path.basename(scorename))
                with open(scorename, 'w') as score_file:
                    score_file.write('table,nrows,ncols,empty_p,score\n')
                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
                        page = data[page_number]
                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
                            table = page[table_number]
                            score_file.write('{0},{1},{2},{3},{4}\n'.format(
                                ''.join([page_number, '_', table_number]),
                                table['nrows'],
                                table['ncols'],
                                table['empty_p'],
                                table['score']))

            if args['--debug']:
                extractor.plot_text()
                manager.debug_plot()
        except Exception as e:
            logging.exception(e.message, exc_info=True)
            sys.exit()

    if data is None:
        if args['--debug']:
            print("See 'camelot <method> -h' for various parameters you can tweak.")
    else:
        output = filedir if args['--output'] is None else args['--output']
        write_to_disk(data, f=args['--format'],
                      output=output, filename=filename)

    vprint("finished in", time.time() - start_time, "seconds")
    logging.info("Time taken: " + str(time.time() - start_time) + " seconds")