Add various metrics to score the quality of a parse
Add various metrics to score the quality of a parse
pull/2/head
parent
43a009dab4
commit
552f9cf422
|
|
@ -1,18 +1,31 @@
|
||||||
from __future__ import print_function
|
from __future__ import division
|
||||||
import os
|
import os
|
||||||
|
import types
|
||||||
|
import copy_reg
|
||||||
|
import logging
|
||||||
|
|
||||||
import cv2
|
import cv2
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
|
from wand.image import Image
|
||||||
|
|
||||||
from .table import Table
|
from .table import Table
|
||||||
from .utils import (transform, elements_bbox, detect_vertical, merge_close_values,
|
from .utils import (transform, elements_bbox, detect_vertical, merge_close_values,
|
||||||
get_row_index, get_column_index, reduce_index, outline,
|
get_row_index, get_column_index, get_score, reduce_index,
|
||||||
fill_spanning, remove_empty, encode_list)
|
outline, fill_spanning, count_empty, encode_list, pdf_to_text)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['Lattice']
|
__all__ = ['Lattice']
|
||||||
|
|
||||||
|
|
||||||
|
def _reduce_method(m):
    """Reduce a method object to a picklable ``(callable, args)`` pair.

    This function is registered with ``copy_reg.pickle`` for
    ``types.MethodType`` right after its definition, so pickling a method
    records "look this name up on its owner" rather than the method
    object itself.

    Parameters
    ----------
    m : method
        Object exposing the Python 2 method attributes ``im_self``,
        ``im_class`` and ``im_func``.

    Returns
    -------
    tuple
        ``(getattr, (owner, name))`` where ``owner`` is ``m.im_self`` for
        a bound method and ``m.im_class`` for an unbound one.
    """
    owner = m.im_class if m.im_self is None else m.im_self
    return getattr, (owner, m.im_func.func_name)
|
||||||
|
copy_reg.pickle(types.MethodType, _reduce_method)
|
||||||
|
|
||||||
|
|
||||||
def _morph_transform(imagename, scale=15, invert=False):
|
def _morph_transform(imagename, scale=15, invert=False):
|
||||||
"""Morphological Transformation
|
"""Morphological Transformation
|
||||||
|
|
||||||
|
|
@ -65,8 +78,8 @@ def _morph_transform(imagename, scale=15, invert=False):
|
||||||
vertical = threshold
|
vertical = threshold
|
||||||
horizontal = threshold
|
horizontal = threshold
|
||||||
|
|
||||||
verticalsize = vertical.shape[0] / scale
|
verticalsize = vertical.shape[0] // scale
|
||||||
horizontalsize = horizontal.shape[1] / scale
|
horizontalsize = horizontal.shape[1] // scale
|
||||||
|
|
||||||
ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize))
|
ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize))
|
||||||
hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1))
|
hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1))
|
||||||
|
|
@ -79,8 +92,12 @@ def _morph_transform(imagename, scale=15, invert=False):
|
||||||
|
|
||||||
mask = vertical + horizontal
|
mask = vertical + horizontal
|
||||||
joints = np.bitwise_and(vertical, horizontal)
|
joints = np.bitwise_and(vertical, horizontal)
|
||||||
__, contours, __ = cv2.findContours(
|
try:
|
||||||
mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
__, contours, __ = cv2.findContours(
|
||||||
|
mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
||||||
|
except ValueError:
|
||||||
|
contours, __ = cv2.findContours(
|
||||||
|
mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
||||||
contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]
|
contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]
|
||||||
|
|
||||||
tables = {}
|
tables = {}
|
||||||
|
|
@ -88,8 +105,12 @@ def _morph_transform(imagename, scale=15, invert=False):
|
||||||
c_poly = cv2.approxPolyDP(c, 3, True)
|
c_poly = cv2.approxPolyDP(c, 3, True)
|
||||||
x, y, w, h = cv2.boundingRect(c_poly)
|
x, y, w, h = cv2.boundingRect(c_poly)
|
||||||
roi = joints[y : y + h, x : x + w]
|
roi = joints[y : y + h, x : x + w]
|
||||||
__, jc, __ = cv2.findContours(
|
try:
|
||||||
roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
|
__, jc, __ = cv2.findContours(
|
||||||
|
roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
|
||||||
|
except ValueError:
|
||||||
|
jc, __ = cv2.findContours(
|
||||||
|
roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
|
||||||
if len(jc) <= 4: # remove contours with less than <=4 joints
|
if len(jc) <= 4: # remove contours with less than <=4 joints
|
||||||
continue
|
continue
|
||||||
joint_coords = []
|
joint_coords = []
|
||||||
|
|
@ -100,16 +121,24 @@ def _morph_transform(imagename, scale=15, invert=False):
|
||||||
tables[(x, y + h, x + w, y)] = joint_coords
|
tables[(x, y + h, x + w, y)] = joint_coords
|
||||||
|
|
||||||
v_segments, h_segments = [], []
|
v_segments, h_segments = [], []
|
||||||
_, vcontours, _ = cv2.findContours(
|
try:
|
||||||
vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
_, vcontours, _ = cv2.findContours(
|
||||||
|
vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
||||||
|
except ValueError:
|
||||||
|
vcontours, _ = cv2.findContours(
|
||||||
|
vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
||||||
for vc in vcontours:
|
for vc in vcontours:
|
||||||
x, y, w, h = cv2.boundingRect(vc)
|
x, y, w, h = cv2.boundingRect(vc)
|
||||||
x1, x2 = x, x + w
|
x1, x2 = x, x + w
|
||||||
y1, y2 = y, y + h
|
y1, y2 = y, y + h
|
||||||
v_segments.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1))
|
v_segments.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1))
|
||||||
|
|
||||||
_, hcontours, _ = cv2.findContours(
|
try:
|
||||||
horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
_, hcontours, _ = cv2.findContours(
|
||||||
|
horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
||||||
|
except ValueError:
|
||||||
|
hcontours, _ = cv2.findContours(
|
||||||
|
horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
||||||
for hc in hcontours:
|
for hc in hcontours:
|
||||||
x, y, w, h = cv2.boundingRect(hc)
|
x, y, w, h = cv2.boundingRect(hc)
|
||||||
x1, x2 = x, x + w
|
x1, x2 = x, x + w
|
||||||
|
|
@ -160,24 +189,19 @@ class Lattice:
|
||||||
page as value.
|
page as value.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, pdfobject, fill=None, scale=15, jtol=2, mtol=2,
|
def __init__(self, fill=None, scale=15, jtol=2, mtol=2,
|
||||||
invert=False, debug=None, verbose=False):
|
invert=False, pdf_margin=(2.0, 0.5, 0.1), debug=None):
|
||||||
|
|
||||||
self.pdfobject = pdfobject
|
self.method = 'lattice'
|
||||||
self.fill = fill
|
self.fill = fill
|
||||||
self.scale = scale
|
self.scale = scale
|
||||||
self.jtol = jtol
|
self.jtol = jtol
|
||||||
self.mtol = mtol
|
self.mtol = mtol
|
||||||
self.invert = invert
|
self.invert = invert
|
||||||
|
self.char_margin, self.line_margin, self.word_margin = pdf_margin
|
||||||
self.debug = debug
|
self.debug = debug
|
||||||
self.verbose = verbose
|
|
||||||
self.tables = {}
|
|
||||||
if self.debug is not None:
|
|
||||||
self.debug_images = {}
|
|
||||||
self.debug_segments = {}
|
|
||||||
self.debug_tables = {}
|
|
||||||
|
|
||||||
def get_tables(self):
|
def get_tables(self, pdfname):
|
||||||
"""Returns all tables found in given pdf.
|
"""Returns all tables found in given pdf.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
|
|
@ -186,169 +210,124 @@ class Lattice:
|
||||||
Dictionary with page number as key and list of tables on that
|
Dictionary with page number as key and list of tables on that
|
||||||
page as value.
|
page as value.
|
||||||
"""
|
"""
|
||||||
vprint = print if self.verbose else lambda *a, **k: None
|
text, __, width, height = pdf_to_text(pdfname, self.char_margin,
|
||||||
self.pdfobject.split()
|
self.line_margin, self.word_margin)
|
||||||
self.pdfobject.convert()
|
bname, __ = os.path.splitext(pdfname)
|
||||||
for page in self.pdfobject.extract():
|
if not text:
|
||||||
p, text, __, width, height = page
|
logging.warning("{0}: PDF has no text. It may be an image.".format(
|
||||||
pkey = 'pg-{0}'.format(p)
|
os.path.basename(bname)))
|
||||||
imagename = os.path.join(
|
return None
|
||||||
self.pdfobject.temp, '{}.png'.format(pkey))
|
imagename = ''.join([bname, '.png'])
|
||||||
pdf_x = width
|
with Image(filename=pdfname, depth=8, resolution=300) as png:
|
||||||
pdf_y = height
|
png.save(filename=imagename)
|
||||||
img, table_bbox, v_segments, h_segments = _morph_transform(
|
pdf_x = width
|
||||||
imagename, scale=self.scale, invert=self.invert)
|
pdf_y = height
|
||||||
img_x = img.shape[1]
|
img, table_bbox, v_segments, h_segments = _morph_transform(
|
||||||
img_y = img.shape[0]
|
imagename, scale=self.scale, invert=self.invert)
|
||||||
scaling_factor_x = pdf_x / float(img_x)
|
img_x = img.shape[1]
|
||||||
scaling_factor_y = pdf_y / float(img_y)
|
img_y = img.shape[0]
|
||||||
|
scaling_factor_x = pdf_x / float(img_x)
|
||||||
|
scaling_factor_y = pdf_y / float(img_y)
|
||||||
|
|
||||||
if self.debug is not None:
|
if self.debug:
|
||||||
self.debug_images[pkey] = (img, table_bbox)
|
self.debug_images = (img, table_bbox)
|
||||||
|
|
||||||
factors = (scaling_factor_x, scaling_factor_y, img_y)
|
factors = (scaling_factor_x, scaling_factor_y, img_y)
|
||||||
table_bbox, v_segments, h_segments = transform(table_bbox, v_segments,
|
table_bbox, v_segments, h_segments = transform(table_bbox, v_segments,
|
||||||
h_segments, factors)
|
h_segments, factors)
|
||||||
|
|
||||||
if self.debug is not None:
|
if self.debug:
|
||||||
self.debug_segments[pkey] = (v_segments, h_segments)
|
self.debug_segments = (v_segments, h_segments)
|
||||||
|
self.debug_tables = []
|
||||||
|
|
||||||
if self.debug is not None:
|
pdf_page = {}
|
||||||
debug_page_tables = []
|
page_tables = {}
|
||||||
page_tables = []
|
table_no = 1
|
||||||
# sort tables based on y-coord
|
# sort tables based on y-coord
|
||||||
for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
|
for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
|
||||||
# select edges which lie within table_bbox
|
# select edges which lie within table_bbox
|
||||||
text_bbox, v_s, h_s = elements_bbox(k, text, v_segments,
|
table_info = {}
|
||||||
h_segments)
|
text_bbox, v_s, h_s = elements_bbox(k, text, v_segments,
|
||||||
rotated = detect_vertical(text_bbox)
|
h_segments)
|
||||||
cols, rows = zip(*table_bbox[k])
|
table_info['text_p'] = 100 * (1 - (len(text_bbox) / len(text)))
|
||||||
cols, rows = list(cols), list(rows)
|
rotated = detect_vertical(text_bbox)
|
||||||
cols.extend([k[0], k[2]])
|
cols, rows = zip(*table_bbox[k])
|
||||||
rows.extend([k[1], k[3]])
|
cols, rows = list(cols), list(rows)
|
||||||
# sort horizontal and vertical segments
|
cols.extend([k[0], k[2]])
|
||||||
cols = merge_close_values(sorted(cols), mtol=self.mtol)
|
rows.extend([k[1], k[3]])
|
||||||
rows = merge_close_values(
|
# sort horizontal and vertical segments
|
||||||
sorted(rows, reverse=True), mtol=self.mtol)
|
cols = merge_close_values(sorted(cols), mtol=self.mtol)
|
||||||
# make grid using x and y coord of shortlisted rows and cols
|
rows = merge_close_values(
|
||||||
cols = [(cols[i], cols[i + 1])
|
sorted(rows, reverse=True), mtol=self.mtol)
|
||||||
for i in range(0, len(cols) - 1)]
|
# make grid using x and y coord of shortlisted rows and cols
|
||||||
rows = [(rows[i], rows[i + 1])
|
cols = [(cols[i], cols[i + 1])
|
||||||
for i in range(0, len(rows) - 1)]
|
for i in range(0, len(cols) - 1)]
|
||||||
table = Table(cols, rows)
|
rows = [(rows[i], rows[i + 1])
|
||||||
# set table edges to True using ver+hor lines
|
for i in range(0, len(rows) - 1)]
|
||||||
table = table.set_edges(v_s, h_s, jtol=self.jtol)
|
table = Table(cols, rows)
|
||||||
# set spanning cells to True
|
# set table edges to True using ver+hor lines
|
||||||
table = table.set_spanning()
|
table = table.set_edges(v_s, h_s, jtol=self.jtol)
|
||||||
# set table border edges to True
|
nouse = table.nocont_ / (len(v_s) + len(h_s))
|
||||||
table = outline(table)
|
table_info['line_p'] = 100 * (1 - nouse)
|
||||||
|
# set spanning cells to True
|
||||||
|
table = table.set_spanning()
|
||||||
|
# set table border edges to True
|
||||||
|
table = outline(table)
|
||||||
|
|
||||||
if self.debug is not None:
|
if self.debug:
|
||||||
debug_page_tables.append(table)
|
self.debug_tables.append(table)
|
||||||
|
|
||||||
# fill text after sorting it
|
# fill text after sorting it
|
||||||
if rotated == '':
|
if rotated == '':
|
||||||
text_bbox.sort(key=lambda x: (-x.y0, x.x0))
|
text_bbox.sort(key=lambda x: (-x.y0, x.x0))
|
||||||
elif rotated == 'left':
|
elif rotated == 'left':
|
||||||
text_bbox.sort(key=lambda x: (x.x0, x.y0))
|
text_bbox.sort(key=lambda x: (x.x0, x.y0))
|
||||||
elif rotated == 'right':
|
elif rotated == 'right':
|
||||||
text_bbox.sort(key=lambda x: (-x.x0, -x.y0))
|
text_bbox.sort(key=lambda x: (-x.x0, -x.y0))
|
||||||
for t in text_bbox:
|
|
||||||
r_idx = get_row_index(t, rows)
|
|
||||||
c_idx = get_column_index(t, cols)
|
|
||||||
if None in [r_idx, c_idx]:
|
|
||||||
# couldn't assign LTChar to any cell
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
r_idx, c_idx = reduce_index(
|
|
||||||
table, rotated, r_idx, c_idx)
|
|
||||||
table.cells[r_idx][c_idx].add_text(
|
|
||||||
t.get_text().strip('\n'))
|
|
||||||
|
|
||||||
if self.fill is not None:
|
rerror = []
|
||||||
table = fill_spanning(table, fill=self.fill)
|
cerror = []
|
||||||
ar = table.get_list()
|
for t in text_bbox:
|
||||||
if rotated == 'left':
|
try:
|
||||||
ar = zip(*ar[::-1])
|
r_idx, rass_error = get_row_index(t, rows)
|
||||||
elif rotated == 'right':
|
except TypeError:
|
||||||
ar = zip(*ar[::1])
|
# couldn't assign LTChar to any cell
|
||||||
ar.reverse()
|
continue
|
||||||
ar = remove_empty(ar)
|
try:
|
||||||
ar = [list(o) for o in ar]
|
c_idx, cass_error = get_column_index(t, cols)
|
||||||
page_tables.append(encode_list(ar))
|
except TypeError:
|
||||||
vprint(pkey)
|
# couldn't assign LTChar to any cell
|
||||||
self.tables[pkey] = page_tables
|
continue
|
||||||
|
rerror.append(rass_error)
|
||||||
|
cerror.append(cass_error)
|
||||||
|
r_idx, c_idx = reduce_index(
|
||||||
|
table, rotated, r_idx, c_idx)
|
||||||
|
table.cells[r_idx][c_idx].add_text(
|
||||||
|
t.get_text().strip('\n'))
|
||||||
|
score = get_score([[50, rerror], [50, cerror]])
|
||||||
|
table_info['score'] = score
|
||||||
|
|
||||||
if self.debug is not None:
|
if self.fill is not None:
|
||||||
self.debug_tables[pkey] = debug_page_tables
|
table = fill_spanning(table, fill=self.fill)
|
||||||
|
ar = table.get_list()
|
||||||
|
if rotated == 'left':
|
||||||
|
ar = zip(*ar[::-1])
|
||||||
|
elif rotated == 'right':
|
||||||
|
ar = zip(*ar[::1])
|
||||||
|
ar.reverse()
|
||||||
|
ar = encode_list(ar)
|
||||||
|
table_info['data'] = ar
|
||||||
|
empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
|
||||||
|
table_info['empty_p'] = empty_p
|
||||||
|
table_info['r_nempty_cells'] = r_nempty_cells
|
||||||
|
table_info['c_nempty_cells'] = c_nempty_cells
|
||||||
|
table_info['nrows'] = len(ar)
|
||||||
|
table_info['ncols'] = len(ar[0])
|
||||||
|
page_tables['table_{0}'.format(table_no)] = table_info
|
||||||
|
table_no += 1
|
||||||
|
pdf_page[os.path.basename(bname)] = page_tables
|
||||||
|
|
||||||
if self.pdfobject.clean:
|
if self.debug:
|
||||||
self.pdfobject.remove_tempdir()
|
|
||||||
|
|
||||||
if self.debug is not None:
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
return self.tables
|
return pdf_page
|
||||||
|
|
||||||
def plot_geometry(self, geometry):
|
|
||||||
"""Plots various pdf geometries that are detected so user can choose
|
|
||||||
tweak scale, jtol, mtol parameters.
|
|
||||||
"""
|
|
||||||
import matplotlib.pyplot as plt
|
|
||||||
|
|
||||||
if geometry == 'contour':
|
|
||||||
for pkey in self.debug_images.keys():
|
|
||||||
img, table_bbox = self.debug_images[pkey]
|
|
||||||
for t in table_bbox.keys():
|
|
||||||
cv2.rectangle(img, (t[0], t[1]),
|
|
||||||
(t[2], t[3]), (255, 0, 0), 3)
|
|
||||||
plt.imshow(img)
|
|
||||||
plt.show()
|
|
||||||
elif geometry == 'joint':
|
|
||||||
x_coord = []
|
|
||||||
y_coord = []
|
|
||||||
for pkey in self.debug_images.keys():
|
|
||||||
img, table_bbox = self.debug_images[pkey]
|
|
||||||
for k in table_bbox.keys():
|
|
||||||
for coord in table_bbox[k]:
|
|
||||||
x_coord.append(coord[0])
|
|
||||||
y_coord.append(coord[1])
|
|
||||||
max_x, max_y = max(x_coord), max(y_coord)
|
|
||||||
plt.plot(x_coord, y_coord, 'ro')
|
|
||||||
plt.axis([0, max_x + 100, max_y + 100, 0])
|
|
||||||
plt.imshow(img)
|
|
||||||
plt.show()
|
|
||||||
elif geometry == 'line':
|
|
||||||
for pkey in self.debug_segments.keys():
|
|
||||||
v_s, h_s = self.debug_segments[pkey]
|
|
||||||
for v in v_s:
|
|
||||||
plt.plot([v[0], v[2]], [v[1], v[3]])
|
|
||||||
for h in h_s:
|
|
||||||
plt.plot([h[0], h[2]], [h[1], h[3]])
|
|
||||||
plt.show()
|
|
||||||
elif geometry == 'table':
|
|
||||||
for pkey in self.debug_tables.keys():
|
|
||||||
for table in self.debug_tables[pkey]:
|
|
||||||
for i in range(len(table.cells)):
|
|
||||||
for j in range(len(table.cells[i])):
|
|
||||||
if table.cells[i][j].left:
|
|
||||||
plt.plot([table.cells[i][j].lb[0],
|
|
||||||
table.cells[i][j].lt[0]],
|
|
||||||
[table.cells[i][j].lb[1],
|
|
||||||
table.cells[i][j].lt[1]])
|
|
||||||
if table.cells[i][j].right:
|
|
||||||
plt.plot([table.cells[i][j].rb[0],
|
|
||||||
table.cells[i][j].rt[0]],
|
|
||||||
[table.cells[i][j].rb[1],
|
|
||||||
table.cells[i][j].rt[1]])
|
|
||||||
if table.cells[i][j].top:
|
|
||||||
plt.plot([table.cells[i][j].lt[0],
|
|
||||||
table.cells[i][j].rt[0]],
|
|
||||||
[table.cells[i][j].lt[1],
|
|
||||||
table.cells[i][j].rt[1]])
|
|
||||||
if table.cells[i][j].bottom:
|
|
||||||
plt.plot([table.cells[i][j].lb[0],
|
|
||||||
table.cells[i][j].rb[0]],
|
|
||||||
[table.cells[i][j].lb[1],
|
|
||||||
table.cells[i][j].rb[1]])
|
|
||||||
plt.show()
|
|
||||||
224
camelot/pdf.py
224
camelot/pdf.py
|
|
@ -1,18 +1,11 @@
|
||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
import tempfile
|
import tempfile
|
||||||
|
import itertools
|
||||||
|
import multiprocessing as mp
|
||||||
|
|
||||||
|
import cv2
|
||||||
from PyPDF2 import PdfFileReader, PdfFileWriter
|
from PyPDF2 import PdfFileReader, PdfFileWriter
|
||||||
from pdfminer.pdfparser import PDFParser
|
|
||||||
from pdfminer.pdfdocument import PDFDocument
|
|
||||||
from pdfminer.pdfpage import PDFPage
|
|
||||||
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
|
|
||||||
from pdfminer.pdfinterp import PDFResourceManager
|
|
||||||
from pdfminer.pdfinterp import PDFPageInterpreter
|
|
||||||
from pdfminer.pdfdevice import PDFDevice
|
|
||||||
from pdfminer.converter import PDFPageAggregator
|
|
||||||
from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal
|
|
||||||
from wand.image import Image
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['Pdf']
|
__all__ = ['Pdf']
|
||||||
|
|
@ -38,38 +31,6 @@ def _parse_page_numbers(pagenos):
|
||||||
return page_numbers
|
return page_numbers
|
||||||
|
|
||||||
|
|
||||||
def _extract_text_objects(layout, LTObject, t=None):
|
|
||||||
"""Recursively parses pdf layout to get a list of
|
|
||||||
text objects.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
layout : object
|
|
||||||
Layout object.
|
|
||||||
|
|
||||||
LTObject : object
|
|
||||||
Text object, either LTChar or LTTextLineHorizontal.
|
|
||||||
|
|
||||||
t : list (optional, default: None)
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
t : list
|
|
||||||
List of text objects.
|
|
||||||
"""
|
|
||||||
if t is None:
|
|
||||||
t = []
|
|
||||||
try:
|
|
||||||
for obj in layout._objs:
|
|
||||||
if isinstance(obj, LTObject):
|
|
||||||
t.append(obj)
|
|
||||||
else:
|
|
||||||
t += _extract_text_objects(obj, LTObject)
|
|
||||||
except AttributeError:
|
|
||||||
pass
|
|
||||||
return t
|
|
||||||
|
|
||||||
|
|
||||||
class Pdf:
|
class Pdf:
|
||||||
"""Handles all pdf operations which include:
|
"""Handles all pdf operations which include:
|
||||||
|
|
||||||
|
|
@ -99,66 +60,163 @@ class Pdf:
|
||||||
is greater than word_margin. (optional, default: 0.1)
|
is greater than word_margin. (optional, default: 0.1)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, pdfname, pagenos=[{'start': 1, 'end': 1}],
|
def __init__(self, extractor, pdfname, pagenos=[{'start': 1, 'end': 1}],
|
||||||
char_margin=2.0, line_margin=0.5, word_margin=0.1,
|
parallel=False, clean=False):
|
||||||
clean=False):
|
|
||||||
|
|
||||||
|
self.extractor = extractor
|
||||||
self.pdfname = pdfname
|
self.pdfname = pdfname
|
||||||
|
if not self.pdfname.endswith('.pdf'):
|
||||||
|
raise TypeError("Only PDF format is supported right now.")
|
||||||
self.pagenos = _parse_page_numbers(pagenos)
|
self.pagenos = _parse_page_numbers(pagenos)
|
||||||
self.char_margin = char_margin
|
self.parallel = parallel
|
||||||
self.line_margin = line_margin
|
self.cpu_count = mp.cpu_count()
|
||||||
self.word_margin = word_margin
|
self.pool = mp.Pool(processes=self.cpu_count)
|
||||||
self.clean = clean
|
self.clean = clean
|
||||||
self.temp = tempfile.mkdtemp()
|
self.temp = tempfile.mkdtemp()
|
||||||
|
|
||||||
def split(self):
|
def split(self):
|
||||||
"""Splits pdf into single page pdfs.
|
"""Splits pdf into single page pdfs.
|
||||||
"""
|
"""
|
||||||
if not self.pdfname.endswith('.pdf'):
|
|
||||||
raise TypeError("Only PDF format is supported.")
|
|
||||||
infile = PdfFileReader(open(self.pdfname, 'rb'), strict=False)
|
infile = PdfFileReader(open(self.pdfname, 'rb'), strict=False)
|
||||||
for p in self.pagenos:
|
for p in self.pagenos:
|
||||||
page = infile.getPage(p - 1)
|
page = infile.getPage(p - 1)
|
||||||
outfile = PdfFileWriter()
|
outfile = PdfFileWriter()
|
||||||
outfile.addPage(page)
|
outfile.addPage(page)
|
||||||
with open(os.path.join(self.temp, 'pg-{0}.pdf'.format(p)), 'wb') as f:
|
with open(os.path.join(self.temp, 'page-{0}.pdf'.format(p)), 'wb') as f:
|
||||||
outfile.write(f)
|
outfile.write(f)
|
||||||
|
|
||||||
|
def remove_tempdir(self):
    """Deletes the temporary directory ``self.temp`` and its contents.

    ``self.temp`` is the directory created with ``tempfile.mkdtemp()``
    in ``__init__``; the single-page pdfs written by ``split`` live there.
    """
    shutil.rmtree(self.temp)
|
||||||
|
|
||||||
def extract(self):
|
def extract(self):
|
||||||
"""Extracts text objects, width, height from a pdf.
|
"""Extracts text objects, width, height from a pdf.
|
||||||
"""
|
"""
|
||||||
for p in self.pagenos:
|
self.split()
|
||||||
pkey = 'pg-{0}'.format(p)
|
pages = [os.path.join(self.temp, 'page-{0}.pdf'.format(p))
|
||||||
pname = os.path.join(self.temp, '{}.pdf'.format(pkey))
|
for p in self.pagenos]
|
||||||
with open(pname, 'r') as f:
|
if self.parallel:
|
||||||
parser = PDFParser(f)
|
tables = self.pool.map(self.extractor.get_tables, pages)
|
||||||
document = PDFDocument(parser)
|
tables = {k: v for d in tables if d is not None for k, v in d.items()}
|
||||||
if not document.is_extractable:
|
else:
|
||||||
raise PDFTextExtractionNotAllowed
|
tables = {}
|
||||||
laparams = LAParams(char_margin=self.char_margin,
|
if self.extractor.debug:
|
||||||
line_margin=self.line_margin,
|
if self.extractor.method == 'stream':
|
||||||
word_margin=self.word_margin)
|
self.debug = self.extractor.debug
|
||||||
rsrcmgr = PDFResourceManager()
|
self.debug_text = []
|
||||||
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
|
elif self.extractor.method == 'lattice':
|
||||||
interpreter = PDFPageInterpreter(rsrcmgr, device)
|
self.debug = self.extractor.debug
|
||||||
for page in PDFPage.create_pages(document):
|
self.debug_images = []
|
||||||
interpreter.process_page(page)
|
self.debug_segments = []
|
||||||
layout = device.get_result()
|
self.debug_tables = []
|
||||||
lattice_objects = _extract_text_objects(layout, LTChar)
|
for p in pages:
|
||||||
stream_objects = _extract_text_objects(
|
table = self.extractor.get_tables(p)
|
||||||
layout, LTTextLineHorizontal)
|
if table is not None:
|
||||||
width = layout.bbox[2]
|
tables.update(table)
|
||||||
height = layout.bbox[3]
|
if self.extractor.debug:
|
||||||
yield p, lattice_objects, stream_objects, width, height
|
if self.extractor.method == 'stream':
|
||||||
|
self.debug_text.append(self.extractor.debug_text)
|
||||||
|
elif self.extractor.method == 'lattice':
|
||||||
|
self.debug_images.append(self.extractor.debug_images)
|
||||||
|
self.debug_segments.append(self.extractor.debug_segments)
|
||||||
|
self.debug_tables.append(self.extractor.debug_tables)
|
||||||
|
if self.clean:
|
||||||
|
self.remove_tempdir()
|
||||||
|
return tables
|
||||||
|
|
||||||
def convert(self):
|
def debug_plot(self):
|
||||||
"""Converts single page pdfs to images.
|
"""Plots all text objects and various pdf geometries so that
|
||||||
|
user can choose number of columns, columns x-coordinates for
|
||||||
|
Stream or tweak Lattice parameters (scale, jtol, mtol).
|
||||||
"""
|
"""
|
||||||
for p in self.pagenos:
|
import matplotlib.pyplot as plt
|
||||||
pdfname = os.path.join(self.temp, 'pg-{0}.pdf'.format(p))
|
import matplotlib.patches as patches
|
||||||
imagename = os.path.join(self.temp, 'pg-{0}.png'.format(p))
|
|
||||||
with Image(filename=pdfname, depth=8, resolution=300) as png:
|
|
||||||
png.save(filename=imagename)
|
|
||||||
|
|
||||||
def remove_tempdir(self):
|
if self.debug is True:
|
||||||
shutil.rmtree(self.temp)
|
try:
|
||||||
|
for text in self.debug_text:
|
||||||
|
fig = plt.figure()
|
||||||
|
ax = fig.add_subplot(111, aspect='equal')
|
||||||
|
xs, ys = [], []
|
||||||
|
for t in text:
|
||||||
|
xs.extend([t[0], t[1]])
|
||||||
|
ys.extend([t[2], t[3]])
|
||||||
|
ax.add_patch(
|
||||||
|
patches.Rectangle(
|
||||||
|
(t[0], t[1]),
|
||||||
|
t[2] - t[0],
|
||||||
|
t[3] - t[1]
|
||||||
|
)
|
||||||
|
)
|
||||||
|
ax.set_xlim(min(xs) - 10, max(xs) + 10)
|
||||||
|
ax.set_ylim(min(ys) - 10, max(ys) + 10)
|
||||||
|
plt.show()
|
||||||
|
except AttributeError:
|
||||||
|
raise ValueError("This option only be used with Stream.")
|
||||||
|
elif self.debug == 'contour':
|
||||||
|
try:
|
||||||
|
for img, table_bbox in self.debug_images:
|
||||||
|
for t in table_bbox.keys():
|
||||||
|
cv2.rectangle(img, (t[0], t[1]),
|
||||||
|
(t[2], t[3]), (255, 0, 0), 3)
|
||||||
|
plt.imshow(img)
|
||||||
|
plt.show()
|
||||||
|
except AttributeError:
|
||||||
|
raise ValueError("This option only be used with Lattice.")
|
||||||
|
elif self.debug == 'joint':
|
||||||
|
try:
|
||||||
|
for img, table_bbox in self.debug_images:
|
||||||
|
x_coord = []
|
||||||
|
y_coord = []
|
||||||
|
for k in table_bbox.keys():
|
||||||
|
for coord in table_bbox[k]:
|
||||||
|
x_coord.append(coord[0])
|
||||||
|
y_coord.append(coord[1])
|
||||||
|
max_x, max_y = max(x_coord), max(y_coord)
|
||||||
|
plt.plot(x_coord, y_coord, 'ro')
|
||||||
|
plt.axis([0, max_x + 100, max_y + 100, 0])
|
||||||
|
plt.imshow(img)
|
||||||
|
plt.show()
|
||||||
|
except AttributeError:
|
||||||
|
raise ValueError("This option only be used with Lattice.")
|
||||||
|
elif self.debug == 'line':
|
||||||
|
try:
|
||||||
|
for v_s, h_s in self.debug_segments:
|
||||||
|
for v in v_s:
|
||||||
|
plt.plot([v[0], v[2]], [v[1], v[3]])
|
||||||
|
for h in h_s:
|
||||||
|
plt.plot([h[0], h[2]], [h[1], h[3]])
|
||||||
|
plt.show()
|
||||||
|
except AttributeError:
|
||||||
|
raise ValueError("This option only be used with Lattice.")
|
||||||
|
elif self.debug == 'table':
|
||||||
|
try:
|
||||||
|
for tables in self.debug_tables:
|
||||||
|
for table in tables:
|
||||||
|
for i in range(len(table.cells)):
|
||||||
|
for j in range(len(table.cells[i])):
|
||||||
|
if table.cells[i][j].left:
|
||||||
|
plt.plot([table.cells[i][j].lb[0],
|
||||||
|
table.cells[i][j].lt[0]],
|
||||||
|
[table.cells[i][j].lb[1],
|
||||||
|
table.cells[i][j].lt[1]])
|
||||||
|
if table.cells[i][j].right:
|
||||||
|
plt.plot([table.cells[i][j].rb[0],
|
||||||
|
table.cells[i][j].rt[0]],
|
||||||
|
[table.cells[i][j].rb[1],
|
||||||
|
table.cells[i][j].rt[1]])
|
||||||
|
if table.cells[i][j].top:
|
||||||
|
plt.plot([table.cells[i][j].lt[0],
|
||||||
|
table.cells[i][j].rt[0]],
|
||||||
|
[table.cells[i][j].lt[1],
|
||||||
|
table.cells[i][j].rt[1]])
|
||||||
|
if table.cells[i][j].bottom:
|
||||||
|
plt.plot([table.cells[i][j].lb[0],
|
||||||
|
table.cells[i][j].rb[0]],
|
||||||
|
[table.cells[i][j].lb[1],
|
||||||
|
table.cells[i][j].rb[1]])
|
||||||
|
plt.show()
|
||||||
|
except AttributeError:
|
||||||
|
raise ValueError("This option only be used with Lattice.")
|
||||||
|
else:
|
||||||
|
raise UserWarning("This method can only be called after"
|
||||||
|
" debug has been specified.")
|
||||||
|
|
@ -1,14 +1,26 @@
|
||||||
from __future__ import print_function
|
from __future__ import division
|
||||||
import os
|
import os
|
||||||
|
import types
|
||||||
|
import copy_reg
|
||||||
|
import logging
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from .utils import get_column_index, encode_list
|
from .table import Table
|
||||||
|
from .utils import get_row_index, get_score, count_empty, encode_list, pdf_to_text
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['Stream']
|
__all__ = ['Stream']
|
||||||
|
|
||||||
|
|
||||||
|
def _reduce_method(m):
    """Pickle reducer for method objects.

    Registered with ``copy_reg.pickle`` for ``types.MethodType`` right
    after this definition. Instead of serialising the method itself, it
    tells pickle to re-fetch the method by name from its owner.

    Parameters
    ----------
    m : method
        Object exposing the Python 2 method attributes ``im_self``,
        ``im_class`` and ``im_func``.

    Returns
    -------
    tuple
        ``(getattr, (owner, name))``; ``owner`` is the instance for a
        bound method, otherwise the class.
    """
    if m.im_self is not None:
        target = m.im_self
    else:
        target = m.im_class
    return getattr, (target, m.im_func.func_name)
|
||||||
|
copy_reg.pickle(types.MethodType, _reduce_method)
|
||||||
|
|
||||||
|
|
||||||
def _group_rows(text, ytol=2):
|
def _group_rows(text, ytol=2):
|
||||||
"""Groups text objects into rows using ytol.
|
"""Groups text objects into rows using ytol.
|
||||||
|
|
||||||
|
|
@ -35,14 +47,16 @@ def _group_rows(text, ytol=2):
|
||||||
# type(obj) is LTChar]):
|
# type(obj) is LTChar]):
|
||||||
if t.get_text().strip():
|
if t.get_text().strip():
|
||||||
if not np.isclose(row_y, t.y0, atol=ytol):
|
if not np.isclose(row_y, t.y0, atol=ytol):
|
||||||
row_y = t.y0
|
rows.append(sorted(temp, key=lambda t: t.x0))
|
||||||
rows.append(temp)
|
|
||||||
temp = []
|
temp = []
|
||||||
|
row_y = t.y0
|
||||||
temp.append(t)
|
temp.append(t)
|
||||||
|
rows.append(sorted(temp, key=lambda t: t.x0))
|
||||||
|
__ = rows.pop(0) # hacky
|
||||||
return rows
|
return rows
|
||||||
|
|
||||||
|
|
||||||
def _merge_columns(l):
|
def _merge_columns(l, mtol=2):
|
||||||
"""Merges overlapping columns and returns list with updated
|
"""Merges overlapping columns and returns list with updated
|
||||||
columns boundaries.
|
columns boundaries.
|
||||||
|
|
||||||
|
|
@ -62,7 +76,8 @@ def _merge_columns(l):
|
||||||
merged.append(higher)
|
merged.append(higher)
|
||||||
else:
|
else:
|
||||||
lower = merged[-1]
|
lower = merged[-1]
|
||||||
if higher[0] <= lower[1]:
|
if (higher[0] <= lower[1] or
|
||||||
|
np.isclose(higher[0], lower[1], atol=mtol)):
|
||||||
upper_bound = max(lower[1], higher[1])
|
upper_bound = max(lower[1], higher[1])
|
||||||
lower_bound = min(lower[0], higher[0])
|
lower_bound = min(lower[0], higher[0])
|
||||||
merged[-1] = (lower_bound, upper_bound)
|
merged[-1] = (lower_bound, upper_bound)
|
||||||
|
|
@ -71,6 +86,62 @@ def _merge_columns(l):
|
||||||
return merged
|
return merged
|
||||||
|
|
||||||
|
|
||||||
|
def _get_column_index(t, columns):
    """Gets index of the column in which the given object falls by
    comparing their co-ordinates.

    Every column that horizontally overlaps the object is scored by the
    overlap width divided by the column width; the highest-scoring column
    wins. If no column overlaps, a warning is logged and index 0 is used.

    Parameters
    ----------
    t : object
        Text object exposing ``x0``, ``x1``, ``y0`` and ``y1``.

    columns : list
        List of ``(left, right)`` x-coordinate pairs.

    Returns
    -------
    c_idx : int
        Index of the chosen column in ``columns``.

    error : float
        Area of the object hanging outside the chosen column (object
        height times horizontal overhang) divided by the area of the
        object's bounding box.

    Raises
    ------
    ValueError
        If ``columns`` is empty.
    ZeroDivisionError
        If an overlapping column has zero width or the object's bounding
        box has zero area.
    """
    offset1, offset2 = 0, 0
    lt_col_overlap = []
    for c in columns:
        if c[0] <= t.x1 and c[1] >= t.x0:
            # Clip the object's horizontal extent to the column.
            left = t.x0 if c[0] <= t.x0 else c[0]
            right = t.x1 if c[1] >= t.x1 else c[1]
            lt_col_overlap.append(abs(left - right) / abs(c[0] - c[1]))
        else:
            # -1 marks "no overlap"; it sorts below every real ratio.
            lt_col_overlap.append(-1)
    # all() instead of len(filter(...)): filter() only returns a list on
    # Python 2, so the old check raised TypeError elsewhere.
    if all(overlap == -1 for overlap in lt_col_overlap):
        logging.warning("Text doesn't fit any column.")
    c_idx = lt_col_overlap.index(max(lt_col_overlap))
    if t.x0 < columns[c_idx][0]:
        offset1 = abs(t.x0 - columns[c_idx][0])
    if t.x1 > columns[c_idx][1]:
        offset2 = abs(t.x1 - columns[c_idx][1])
    Y = abs(t.y0 - t.y1)
    charea = abs(t.x0 - t.x1) * abs(t.y0 - t.y1)
    error = (Y * (offset1 + offset2)) / charea
    return c_idx, error
|
||||||
|
|
||||||
|
|
||||||
|
def _add_columns(cols, text, ytolerance):
    """Extends cols with column boundaries derived from text.

    The text objects are grouped into rows; the x-extents of the
    objects in the rows holding the most objects are merged and
    appended to cols.

    Parameters
    ----------
    cols : list
        List of (x0, x1) tuples, extended in place.

    text : list
        Text objects exposing x0 and x1. Nothing is added when empty.

    ytolerance : int
        Passed to _group_rows as ytol.

    Returns
    -------
    cols : list
        The same list that was passed in.
    """
    if not text:
        return cols

    grouped = _group_rows(text, ytol=ytolerance)
    sizes = [len(row) for row in grouped]
    widest = max(sizes) if sizes else None
    extents = []
    for row in grouped:
        if len(row) != widest:
            continue
        for obj in row:
            extents.append((obj.x0, obj.x1))
    cols.extend(_merge_columns(sorted(extents)))
    return cols
|
||||||
|
|
||||||
|
|
||||||
|
def _join_columns(cols, width):
    """Turns text column extents into contiguous column boundaries.

    Adjacent columns are split at the midpoint of the gap between
    them; the first column starts at 0 and the last one ends at width.

    Parameters
    ----------
    cols : list
        List of (x0, x1) tuples.

    width : float
        Right edge used for the last column.

    Returns
    -------
    cols : list
        List of (left, right) tuples covering 0 to width.
    """
    ordered = sorted(cols)
    edges = [0]
    for previous, current in zip(ordered, ordered[1:]):
        edges.append((current[0] + previous[1]) / 2)
    edges.append(width)  # or some tolerance
    return [(edges[k], edges[k + 1]) for k in range(len(edges) - 1)]
|
||||||
|
|
||||||
|
|
||||||
class Stream:
|
class Stream:
|
||||||
"""Stream algorithm
|
"""Stream algorithm
|
||||||
|
|
||||||
|
|
@ -105,20 +176,18 @@ class Stream:
|
||||||
page as value.
|
page as value.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, pdfobject, ncolumns=0, columns=None, ytol=2,
|
def __init__(self, ncolumns=0, columns=None, ytol=2, mtol=2,
|
||||||
debug=False, verbose=False):
|
pdf_margin=(2.0, 0.5, 0.1), debug=False):
|
||||||
|
|
||||||
self.pdfobject = pdfobject
|
self.method = 'stream'
|
||||||
self.ncolumns = ncolumns
|
self.ncolumns = ncolumns
|
||||||
self.columns = columns
|
self.columns = columns
|
||||||
self.ytol = ytol
|
self.ytol = ytol
|
||||||
|
self.mtol = mtol
|
||||||
|
self.char_margin, self.line_margin, self.word_margin = pdf_margin
|
||||||
self.debug = debug
|
self.debug = debug
|
||||||
self.verbose = verbose
|
|
||||||
self.tables = {}
|
|
||||||
if self.debug:
|
|
||||||
self.debug_text = {}
|
|
||||||
|
|
||||||
def get_tables(self):
|
def get_tables(self, pdfname):
|
||||||
"""Returns all tables found in given pdf.
|
"""Returns all tables found in given pdf.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
|
|
@ -127,86 +196,112 @@ class Stream:
|
||||||
Dictionary with page number as key and list of tables on that
|
Dictionary with page number as key and list of tables on that
|
||||||
page as value.
|
page as value.
|
||||||
"""
|
"""
|
||||||
vprint = print if self.verbose else lambda *a, **k: None
|
__, text, width, height = pdf_to_text(pdfname, self.char_margin,
|
||||||
self.pdfobject.split()
|
self.line_margin, self.word_margin)
|
||||||
for page in self.pdfobject.extract():
|
bname, __ = os.path.splitext(pdfname)
|
||||||
p, __, text, __, __ = page
|
if not text:
|
||||||
pkey = 'pg-{0}'.format(p)
|
logging.warning("{0}: PDF has no text. It may be an image.".format(
|
||||||
text.sort(key=lambda x: (-x.y0, x.x0))
|
os.path.basename(bname)))
|
||||||
|
return None
|
||||||
if self.debug:
|
text.sort(key=lambda x: (-x.y0, x.x0))
|
||||||
self.debug_text[pkey] = text
|
|
||||||
|
|
||||||
rows = _group_rows(text, ytol=self.ytol)
|
|
||||||
elements = [len(r) for r in rows]
|
|
||||||
# a table can't have just 1 column, can it?
|
|
||||||
elements = filter(lambda x: x != 1, elements)
|
|
||||||
|
|
||||||
guess = False
|
|
||||||
if self.columns:
|
|
||||||
cols = self.columns.split(',')
|
|
||||||
cols = [(float(cols[i]), float(cols[i + 1]))
|
|
||||||
for i in range(0, len(cols) - 1)]
|
|
||||||
else:
|
|
||||||
guess = True
|
|
||||||
ncols = self.ncolumns if self.ncolumns else max(
|
|
||||||
set(elements), key=elements.count)
|
|
||||||
if ncols == 0:
|
|
||||||
# no tables detected
|
|
||||||
continue
|
|
||||||
cols = [(t.x0, t.x1)
|
|
||||||
for r in rows for t in r if len(r) == ncols]
|
|
||||||
cols = _merge_columns(sorted(cols))
|
|
||||||
cols = [(c[0] + c[1]) / 2.0 for c in cols]
|
|
||||||
|
|
||||||
ar = [['' for c in cols] for r in rows]
|
|
||||||
for r_idx, r in enumerate(rows):
|
|
||||||
for t in r:
|
|
||||||
if guess:
|
|
||||||
cog = (t.x0 + t.x1) / 2.0
|
|
||||||
diff = [abs(cog - c) for c in cols]
|
|
||||||
c_idx = diff.index(min(diff))
|
|
||||||
else:
|
|
||||||
c_idx = get_column_index(t, cols)
|
|
||||||
if None in [r_idx, c_idx]: # couldn't assign LTTextLH to any cell
|
|
||||||
continue
|
|
||||||
if ar[r_idx][c_idx]:
|
|
||||||
ar[r_idx][c_idx] = ' '.join(
|
|
||||||
[ar[r_idx][c_idx], t.get_text().strip()])
|
|
||||||
else:
|
|
||||||
ar[r_idx][c_idx] = t.get_text().strip()
|
|
||||||
vprint(pkey)
|
|
||||||
self.tables[pkey] = [encode_list(ar)]
|
|
||||||
|
|
||||||
if self.pdfobject.clean:
|
|
||||||
self.pdfobject.remove_tempdir()
|
|
||||||
|
|
||||||
if self.debug:
|
if self.debug:
|
||||||
|
self.debug_text = [(t.x0, t.y0, t.x1, t.y1) for t in text]
|
||||||
return None
|
return None
|
||||||
|
|
||||||
return self.tables
|
rows_grouped = _group_rows(text, ytol=self.ytol)
|
||||||
|
elements = [len(r) for r in rows_grouped]
|
||||||
|
row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r)
|
||||||
|
if len(r) > 0 else 0 for r in rows_grouped]
|
||||||
|
rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
|
||||||
|
rows.insert(0, height) # or some tolerance
|
||||||
|
rows.append(0)
|
||||||
|
rows = [(rows[i], rows[i + 1])
|
||||||
|
for i in range(0, len(rows) - 1)]
|
||||||
|
|
||||||
def plot_text(self):
|
guess = False
|
||||||
"""Plots all text objects so user can choose number of columns
|
if self.columns:
|
||||||
or columns x-coordinates using the matplotlib interface.
|
# user has to input boundary columns too
|
||||||
"""
|
# take (0, width) by default
|
||||||
import matplotlib.pyplot as plt
|
# similar to else condition
|
||||||
import matplotlib.patches as patches
|
# len can't be 1
|
||||||
|
cols = self.columns.split(',')
|
||||||
|
cols = [(float(cols[i]), float(cols[i + 1]))
|
||||||
|
for i in range(0, len(cols) - 1)]
|
||||||
|
else:
|
||||||
|
if self.ncolumns:
|
||||||
|
ncols = self.ncolumns
|
||||||
|
cols = [(t.x0, t.x1)
|
||||||
|
for r in rows_grouped if len(r) == ncols for t in r]
|
||||||
|
cols = _merge_columns(sorted(cols), mtol=self.mtol)
|
||||||
|
if len(cols) != self.ncolumns:
|
||||||
|
logging.warning("{}: The number of columns after merge"
|
||||||
|
" isn't the same as what you specified."
|
||||||
|
" Change the value of mtol.".format(
|
||||||
|
os.path.basename(bname)))
|
||||||
|
cols = _join_columns(cols, width)
|
||||||
|
else:
|
||||||
|
guess = True
|
||||||
|
ncols = max(set(elements), key=elements.count)
|
||||||
|
len_non_mode = len(filter(lambda x: x != ncols, elements))
|
||||||
|
if ncols == 1 and not self.debug:
|
||||||
|
# no tables detected
|
||||||
|
logging.warning("{}: Only one column was detected, the PDF"
|
||||||
|
" may have no tables. Specify ncols if"
|
||||||
|
" the PDF has tables.".format(
|
||||||
|
os.path.basename(bname)))
|
||||||
|
cols = [(t.x0, t.x1)
|
||||||
|
for r in rows_grouped if len(r) == ncols for t in r]
|
||||||
|
cols = _merge_columns(sorted(cols), mtol=self.mtol)
|
||||||
|
inner_text = []
|
||||||
|
for i in range(1, len(cols)):
|
||||||
|
left = cols[i - 1][1]
|
||||||
|
right = cols[i][0]
|
||||||
|
inner_text.extend([t for t in text if t.x0 > left and t.x1 < right])
|
||||||
|
outer_text = [t for t in text if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
|
||||||
|
inner_text.extend(outer_text)
|
||||||
|
cols = _add_columns(cols, inner_text, self.ytol)
|
||||||
|
cols = _join_columns(cols, width)
|
||||||
|
|
||||||
for pkey in sorted(self.debug_text.keys()):
|
pdf_page = {}
|
||||||
fig = plt.figure()
|
page_tables = {}
|
||||||
ax = fig.add_subplot(111, aspect='equal')
|
table_info = {}
|
||||||
xs, ys = [], []
|
table = Table(cols, rows)
|
||||||
for t in self.debug_text[pkey]:
|
rerror = []
|
||||||
xs.extend([t.x0, t.x1])
|
cerror = []
|
||||||
ys.extend([t.y0, t.y1])
|
for row in rows_grouped:
|
||||||
ax.add_patch(
|
for t in row:
|
||||||
patches.Rectangle(
|
try:
|
||||||
(t.x0, t.y0),
|
r_idx, rass_error = get_row_index(t, rows)
|
||||||
t.x1 - t.x0,
|
except ValueError as e:
|
||||||
t.y1 - t.y0
|
# couldn't assign LTTextLH to any cell
|
||||||
)
|
vprint(e.message)
|
||||||
)
|
continue
|
||||||
ax.set_xlim(min(xs) - 10, max(xs) + 10)
|
try:
|
||||||
ax.set_ylim(min(ys) - 10, max(ys) + 10)
|
c_idx, cass_error = _get_column_index(t, cols)
|
||||||
plt.show()
|
except ValueError as e:
|
||||||
|
# couldn't assign LTTextLH to any cell
|
||||||
|
vprint(e.message)
|
||||||
|
continue
|
||||||
|
rerror.append(rass_error)
|
||||||
|
cerror.append(cass_error)
|
||||||
|
table.cells[r_idx][c_idx].add_text(
|
||||||
|
t.get_text().strip('\n'))
|
||||||
|
if guess:
|
||||||
|
score = get_score([[33, rerror], [33, cerror], [34, [len_non_mode / len(elements)]]])
|
||||||
|
else:
|
||||||
|
score = get_score([[50, rerror], [50, cerror]])
|
||||||
|
table_info['score'] = score
|
||||||
|
ar = table.get_list()
|
||||||
|
ar = encode_list(ar)
|
||||||
|
table_info['data'] = ar
|
||||||
|
empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
|
||||||
|
table_info['empty_p'] = empty_p
|
||||||
|
table_info['r_nempty_cells'] = r_nempty_cells
|
||||||
|
table_info['c_nempty_cells'] = c_nempty_cells
|
||||||
|
table_info['nrows'] = len(ar)
|
||||||
|
table_info['ncols'] = len(ar[0])
|
||||||
|
page_tables['table_1'] = table_info
|
||||||
|
pdf_page[os.path.basename(bname)] = page_tables
|
||||||
|
|
||||||
|
return pdf_page
|
||||||
|
|
@ -26,6 +26,7 @@ class Table:
|
||||||
self.rows = rows
|
self.rows = rows
|
||||||
self.cells = [[Cell(c[0], r[1], c[1], r[0])
|
self.cells = [[Cell(c[0], r[1], c[1], r[0])
|
||||||
for c in cols] for r in rows]
|
for c in cols] for r in rows]
|
||||||
|
self.nocont_ = 0
|
||||||
|
|
||||||
def set_edges(self, vertical, horizontal, jtol=2):
|
def set_edges(self, vertical, horizontal, jtol=2):
|
||||||
"""Sets cell edges to True if corresponding line segments
|
"""Sets cell edges to True if corresponding line segments
|
||||||
|
|
@ -53,6 +54,7 @@ class Table:
|
||||||
k = [k for k, t in enumerate(self.rows)
|
k = [k for k, t in enumerate(self.rows)
|
||||||
if np.isclose(v[1], t[0], atol=jtol)]
|
if np.isclose(v[1], t[0], atol=jtol)]
|
||||||
if not j:
|
if not j:
|
||||||
|
self.nocont_ += 1
|
||||||
continue
|
continue
|
||||||
J = j[0]
|
J = j[0]
|
||||||
if i == [0]: # only left edge
|
if i == [0]: # only left edge
|
||||||
|
|
@ -104,6 +106,7 @@ class Table:
|
||||||
k = [k for k, t in enumerate(self.cols)
|
k = [k for k, t in enumerate(self.cols)
|
||||||
if np.isclose(h[2], t[0], atol=jtol)]
|
if np.isclose(h[2], t[0], atol=jtol)]
|
||||||
if not j:
|
if not j:
|
||||||
|
self.nocont_ += 1
|
||||||
continue
|
continue
|
||||||
J = j[0]
|
J = j[0]
|
||||||
if i == [0]: # only top edge
|
if i == [0]: # only top edge
|
||||||
|
|
|
||||||
168
camelot/utils.py
168
camelot/utils.py
|
|
@ -1,5 +1,18 @@
|
||||||
|
from __future__ import division
|
||||||
|
import os
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
|
from pdfminer.pdfparser import PDFParser
|
||||||
|
from pdfminer.pdfdocument import PDFDocument
|
||||||
|
from pdfminer.pdfpage import PDFPage
|
||||||
|
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
|
||||||
|
from pdfminer.pdfinterp import PDFResourceManager
|
||||||
|
from pdfminer.pdfinterp import PDFPageInterpreter
|
||||||
|
from pdfminer.pdfdevice import PDFDevice
|
||||||
|
from pdfminer.converter import PDFPageAggregator
|
||||||
|
from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal
|
||||||
|
|
||||||
|
|
||||||
def translate(x1, x2):
|
def translate(x1, x2):
|
||||||
"""Translates x2 by x1.
|
"""Translates x2 by x1.
|
||||||
|
|
@ -243,15 +256,24 @@ def get_row_index(t, rows):
|
||||||
----------
|
----------
|
||||||
t : object
|
t : object
|
||||||
|
|
||||||
rows : list
|
rows : list, sorted in decreasing order
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
r : int
|
r : int
|
||||||
"""
|
"""
|
||||||
|
offset1, offset2 = 0, 0
|
||||||
for r in range(len(rows)):
|
for r in range(len(rows)):
|
||||||
if (t.y0 + t.y1) / 2.0 < rows[r][0] and (t.y0 + t.y1) / 2.0 > rows[r][1]:
|
if (t.y0 + t.y1) / 2.0 < rows[r][0] and (t.y0 + t.y1) / 2.0 > rows[r][1]:
|
||||||
return r
|
if t.y0 > rows[r][0]:
|
||||||
|
offset1 = abs(t.y0 - rows[r][0])
|
||||||
|
if t.y1 < rows[r][1]:
|
||||||
|
offset2 = abs(t.y1 - rows[r][1])
|
||||||
|
X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1)
|
||||||
|
Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1)
|
||||||
|
charea = X * Y
|
||||||
|
error = (X * (offset1 + offset2)) / charea
|
||||||
|
return r, error
|
||||||
|
|
||||||
|
|
||||||
def get_column_index(t, columns):
|
def get_column_index(t, columns):
|
||||||
|
|
@ -268,9 +290,45 @@ def get_column_index(t, columns):
|
||||||
-------
|
-------
|
||||||
c : int
|
c : int
|
||||||
"""
|
"""
|
||||||
|
offset1, offset2 = 0, 0
|
||||||
for c in range(len(columns)):
|
for c in range(len(columns)):
|
||||||
if (t.x0 + t.x1) / 2.0 > columns[c][0] and (t.x0 + t.x1) / 2.0 < columns[c][1]:
|
if (t.x0 + t.x1) / 2.0 > columns[c][0] and (t.x0 + t.x1) / 2.0 < columns[c][1]:
|
||||||
return c
|
if t.x0 < columns[c][0]:
|
||||||
|
offset1 = abs(t.x0 - columns[c][0])
|
||||||
|
if t.x1 > columns[c][1]:
|
||||||
|
offset2 = abs(t.x1 - columns[c][1])
|
||||||
|
X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1)
|
||||||
|
Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1)
|
||||||
|
charea = X * Y
|
||||||
|
error = (Y * (offset1 + offset2)) / charea
|
||||||
|
return c, error
|
||||||
|
|
||||||
|
|
||||||
|
def get_score(error_weights):
    """Calculates score based on weights assigned to various parameters,
    and their error percentages.

    Parameters
    ----------
    error_weights : list
        List of [weight, errors] pairs, where weight is the weightage
        assigned to a parameter and errors is the list of error
        fractions recorded for it. Sum of all weights should be equal
        to 100.

    Returns
    -------
    score : float
        Weighted score out of 100. 0 is returned when any parameter
        has no recorded errors, since there is nothing to score.

    Raises
    ------
    ValueError
        If the weights do not add up to 100.
    """
    SCORE_VAL = 100
    if sum(ew[0] for ew in error_weights) != SCORE_VAL:
        raise ValueError("Please assign a valid weightage to each parameter"
                         " such that their sum is equal to 100")

    score = 0
    for ew in error_weights:
        errors = ew[1]
        if not errors:
            # no samples for this parameter: dividing the weight by
            # len(errors) would raise ZeroDivisionError
            return 0
        # each sample gets an equal share of the parameter's weight
        weight = ew[0] / float(len(errors))
        for error_percentage in errors:
            score += weight * (1 - error_percentage)
    return score
|
||||||
|
|
||||||
|
|
||||||
def reduce_index(t, rotated, r_idx, c_idx):
|
def reduce_index(t, rotated, r_idx, c_idx):
|
||||||
|
|
@ -394,6 +452,110 @@ def remove_empty(d):
|
||||||
return d
|
return d
|
||||||
|
|
||||||
|
|
||||||
|
def count_empty(d):
    """Counts empty and non-empty cells in a list of lists.

    A cell is empty when it contains only whitespace.

    Parameters
    ----------
    d : list
        List of rows, each row being a list of strings.

    Returns
    -------
    empty_p : float
        Percentage of empty cells, 0 when d has no cells.

    r_nempty_cells : list
        Number of non-empty cells in each row.

    c_nempty_cells : list
        Number of non-empty cells in each column.
    """
    # True for every cell that holds something other than whitespace
    filled = [[cell.strip() != '' for cell in row] for row in d]
    r_nempty_cells = [sum(row) for row in filled]
    c_nempty_cells = [sum(col) for col in zip(*filled)]

    # total is taken from the first row, i.e. d is assumed rectangular
    n_cells = len(d) * len(d[0]) if d else 0
    if n_cells:
        n_empty = sum(len(row) for row in filled) - sum(r_nempty_cells)
        empty_p = 100 * (n_empty / float(n_cells))
    else:
        # no cells at all: avoid IndexError / ZeroDivisionError
        empty_p = 0
    return empty_p, r_nempty_cells, c_nempty_cells
|
||||||
|
|
||||||
|
|
||||||
def encode_list(ar):
|
def encode_list(ar):
|
||||||
|
"""Encodes list of text.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
ar : list
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
ar : list
|
||||||
|
"""
|
||||||
ar = [[r.encode('utf-8') for r in row] for row in ar]
|
ar = [[r.encode('utf-8') for r in row] for row in ar]
|
||||||
return ar
|
return ar
|
||||||
|
|
||||||
|
|
||||||
|
def extract_text_objects(layout, LTObject, t=None):
    """Walks a pdf layout tree and collects its text objects.

    Parameters
    ----------
    layout : object
        Layout object; its children are read from its _objs attribute.

    LTObject : object
        Type of text object to collect, either LTChar or
        LTTextLineHorizontal.

    t : list (optional, default: None)
        List to append the matches to; a new list is used when None.

    Returns
    -------
    t : list
        List of text objects.
    """
    collected = [] if t is None else t
    try:
        for child in layout._objs:
            if not isinstance(child, LTObject):
                # not a match itself, look inside it
                collected += extract_text_objects(child, LTObject)
                continue
            collected.append(child)
    except AttributeError:
        # objects without _objs have no children to descend into
        pass
    return collected
|
||||||
|
|
||||||
|
|
||||||
|
def pdf_to_text(pname, char_margin, line_margin, word_margin):
    """Parses a pdf and returns its text objects and page dimensions.

    Every page is processed in turn and the values of the last page
    are returned.
    NOTE(review): presumably called with single-page pdfs; confirm
    against the callers.

    Parameters
    ----------
    pname : str
        Path to the pdf file.

    char_margin : float
        Passed to LAParams as char_margin.

    line_margin : float
        Passed to LAParams as line_margin.

    word_margin : float
        Passed to LAParams as word_margin.

    Returns
    -------
    lattice_objects : list
        LTChar objects found in the layout.

    stream_objects : list
        LTTextLineHorizontal objects found in the layout.

    width : float
        Third element of the layout's bbox, 0 if the pdf has no pages.

    height : float
        Fourth element of the layout's bbox, 0 if the pdf has no pages.

    Raises
    ------
    PDFTextExtractionNotAllowed
        If the document is not extractable.
    """
    # defaults so that a pdf without pages doesn't fail at return
    lattice_objects, stream_objects = [], []
    width, height = 0, 0
    # pdfs are binary files, text mode can corrupt or fail to decode them
    with open(pname, 'rb') as f:
        parser = PDFParser(f)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        laparams = LAParams(char_margin=char_margin,
                            line_margin=line_margin,
                            word_margin=word_margin)
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # pages are read while the file is still open
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            lattice_objects = extract_text_objects(layout, LTChar)
            stream_objects = extract_text_objects(
                layout, LTTextLineHorizontal)
            width = layout.bbox[2]
            height = layout.bbox[3]
    return lattice_objects, stream_objects, width, height
|
||||||
|
|
@ -39,7 +39,7 @@ Usage
|
||||||
|
|
||||||
>>> extractor = Lattice(Pdf('us-030.pdf'))
|
>>> extractor = Lattice(Pdf('us-030.pdf'))
|
||||||
>>> tables = extractor.get_tables()
|
>>> tables = extractor.get_tables()
|
||||||
>>> print tables['pg-1']
|
>>> print tables['page-1'][0]
|
||||||
|
|
||||||
.. csv-table::
|
.. csv-table::
|
||||||
:header: "Cycle Name","KI (1/km)","Distance (mi)","Percent Fuel Savings","","",""
|
:header: "Cycle Name","KI (1/km)","Distance (mi)","Percent Fuel Savings","","",""
|
||||||
|
|
|
||||||
|
|
@ -65,7 +65,7 @@ Finally, the characters found on the page are assigned to cells based on their x
|
||||||
|
|
||||||
>>> extractor = Lattice(Pdf('us-030.pdf'))
|
>>> extractor = Lattice(Pdf('us-030.pdf'))
|
||||||
>>> tables = extractor.get_tables()
|
>>> tables = extractor.get_tables()
|
||||||
>>> print tables['pg-1']
|
>>> print tables['page-1'][0]
|
||||||
|
|
||||||
.. csv-table::
|
.. csv-table::
|
||||||
:header: "Cycle Name","KI (1/km)","Distance (mi)","Percent Fuel Savings","","",""
|
:header: "Cycle Name","KI (1/km)","Distance (mi)","Percent Fuel Savings","","",""
|
||||||
|
|
@ -114,7 +114,7 @@ In the PDF used above, you can see that some cells spanned a lot of rows, `fill`
|
||||||
|
|
||||||
>>> extractor = Lattice(Pdf('row_span_1.pdf'), fill='v', scale=40)
|
>>> extractor = Lattice(Pdf('row_span_1.pdf'), fill='v', scale=40)
|
||||||
>>> tables = extractor.get_tables()
|
>>> tables = extractor.get_tables()
|
||||||
>>> print tables['pg-1']
|
>>> print tables['page-1'][0]
|
||||||
|
|
||||||
.. csv-table::
|
.. csv-table::
|
||||||
:header: "Plan Type","County","Plan Name","Totals"
|
:header: "Plan Type","County","Plan Name","Totals"
|
||||||
|
|
@ -173,7 +173,7 @@ To find line segments, Lattice needs the lines of the PDF to be in foreground. S
|
||||||
|
|
||||||
>>> extractor = Lattice(Pdf('lines_in_background_1.pdf'), invert=True)
|
>>> extractor = Lattice(Pdf('lines_in_background_1.pdf'), invert=True)
|
||||||
>>> tables = extractor.get_tables()
|
>>> tables = extractor.get_tables()
|
||||||
>>> print tables['pg-1']
|
>>> print tables['page-1'][0]
|
||||||
|
|
||||||
.. csv-table::
|
.. csv-table::
|
||||||
:header: "State","Date","Halt stations","Halt days","Persons directly reached(in lakh)","Persons trained","Persons counseled","Persons testedfor HIV"
|
:header: "State","Date","Halt stations","Halt days","Persons directly reached(in lakh)","Persons trained","Persons counseled","Persons testedfor HIV"
|
||||||
|
|
|
||||||
|
|
@ -17,7 +17,7 @@ Let's run it on this PDF.
|
||||||
|
|
||||||
>>> extractor = Stream(Pdf('eu-027.pdf'))
|
>>> extractor = Stream(Pdf('eu-027.pdf'))
|
||||||
>>> tables = extractor.get_tables()
|
>>> tables = extractor.get_tables()
|
||||||
>>> print tables['pg-1']
|
>>> print tables['page-1'][0]
|
||||||
|
|
||||||
.. .. _this: insert link for eu-027.pdf
|
.. .. _this: insert link for eu-027.pdf
|
||||||
|
|
||||||
|
|
@ -68,7 +68,7 @@ But sometimes its guess could be incorrect, like in this case.
|
||||||
|
|
||||||
>>> extractor = Stream(Pdf('missing_values.pdf'))
|
>>> extractor = Stream(Pdf('missing_values.pdf'))
|
||||||
>>> tables = extractor.get_tables()
|
>>> tables = extractor.get_tables()
|
||||||
>>> print tables['pg-1']
|
>>> print tables['page-1'][0]
|
||||||
|
|
||||||
.. .. _this: insert link for missing_values.pdf
|
.. .. _this: insert link for missing_values.pdf
|
||||||
|
|
||||||
|
|
@ -127,7 +127,7 @@ It guessed that the PDF has 3 columns, because there wasn't any data in the last
|
||||||
|
|
||||||
>>> extractor = Stream(Pdf('missing_values.pdf'), ncolumns=5)
|
>>> extractor = Stream(Pdf('missing_values.pdf'), ncolumns=5)
|
||||||
>>> tables = extractor.get_tables()
|
>>> tables = extractor.get_tables()
|
||||||
>>> print tables['pg-1']
|
>>> print tables['page-1'][0]
|
||||||
|
|
||||||
.. csv-table::
|
.. csv-table::
|
||||||
|
|
||||||
|
|
@ -200,7 +200,7 @@ After getting the x-coordinates, we just need to pass them to Stream, like this.
|
||||||
|
|
||||||
>>> extractor = Stream(Pdf('mexican_towns.pdf'), columns='28,67,180,230,425,475,700')
|
>>> extractor = Stream(Pdf('mexican_towns.pdf'), columns='28,67,180,230,425,475,700')
|
||||||
>>> tables = extractor.get_tables()
|
>>> tables = extractor.get_tables()
|
||||||
>>> print tables['pg-1']
|
>>> print tables['page-1'][0]
|
||||||
|
|
||||||
.. csv-table::
|
.. csv-table::
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -26,7 +26,7 @@ def test_lattice_basic():
|
||||||
extractor = Lattice(Pdf(pdfname,
|
extractor = Lattice(Pdf(pdfname,
|
||||||
pagenos=[{'start': 2, 'end': 2}], clean=True))
|
pagenos=[{'start': 2, 'end': 2}], clean=True))
|
||||||
tables = extractor.get_tables()
|
tables = extractor.get_tables()
|
||||||
assert_equal(tables['pg-2'][0], data)
|
assert_equal(tables['page-2'][0], data)
|
||||||
|
|
||||||
|
|
||||||
def test_lattice_fill():
|
def test_lattice_fill():
|
||||||
|
|
@ -76,7 +76,7 @@ def test_lattice_fill():
|
||||||
pdfname = os.path.join(testdir, 'row_span_1.pdf')
|
pdfname = os.path.join(testdir, 'row_span_1.pdf')
|
||||||
extractor = Lattice(Pdf(pdfname, clean=True), fill='v', scale=40)
|
extractor = Lattice(Pdf(pdfname, clean=True), fill='v', scale=40)
|
||||||
tables = extractor.get_tables()
|
tables = extractor.get_tables()
|
||||||
assert_equal(tables['pg-1'][0], data)
|
assert_equal(tables['pagea-1'][0], data)
|
||||||
|
|
||||||
|
|
||||||
def test_lattice_invert():
|
def test_lattice_invert():
|
||||||
|
|
@ -94,4 +94,4 @@ def test_lattice_invert():
|
||||||
pdfname = os.path.join(testdir, 'lines_in_background_1.pdf')
|
pdfname = os.path.join(testdir, 'lines_in_background_1.pdf')
|
||||||
extractor = Lattice(Pdf(pdfname, clean=True), invert=True)
|
extractor = Lattice(Pdf(pdfname, clean=True), invert=True)
|
||||||
tables = extractor.get_tables()
|
tables = extractor.get_tables()
|
||||||
assert_equal(tables['pg-1'][1], data)
|
assert_equal(tables['page-1'][1], data)
|
||||||
|
|
@ -13,57 +13,62 @@ testdir = os.path.dirname(os.path.abspath(__file__))
|
||||||
def test_stream_basic():
|
def test_stream_basic():
|
||||||
|
|
||||||
data = [
|
data = [
|
||||||
["","","","",""],
|
["Clave","Nombre Entidad","Clave","Nombre Municipio","Clave","Nombre Localidad"],
|
||||||
["C Appendix C: Summary Statistics","","","",""],
|
["Entidad","","Municipio","","Localidad",""],
|
||||||
["","Table C1: Summary Statistics","","",""],
|
["01","Aguascalientes","001","Aguascalientes","0094","Granja Adelita"],
|
||||||
["","This table contains summary statistics for 2,012 respondents in SAVE 2009.","","",""],
|
["01","Aguascalientes","001","Aguascalientes","0096","Agua Azul"],
|
||||||
["Variable","Mean","Std. Dev. Min","","Max"],
|
["01","Aguascalientes","001","Aguascalientes","0100","Rancho Alegre"],
|
||||||
["Age","50.8","15.9","21","90"],
|
["01","Aguascalientes","001","Aguascalientes","0102","Los Arbolitos [Rancho]"],
|
||||||
["Men","0.47","0.50","0","1"],
|
["01","Aguascalientes","001","Aguascalientes","0104","Ardillas de Abajo (Las Ardillas)"],
|
||||||
["East","0.28","0.45","0","1"],
|
["01","Aguascalientes","001","Aguascalientes","0106","Arellano"],
|
||||||
["Rural","0.15","0.36","0","1"],
|
["01","Aguascalientes","001","Aguascalientes","0112","Bajío los Vázquez"],
|
||||||
["Married","0.57","0.50","0","1"],
|
["01","Aguascalientes","001","Aguascalientes","0113","Bajío de Montoro"],
|
||||||
["Single","0.21","0.40","0","1"],
|
["01","Aguascalientes","001","Aguascalientes","0114","Residencial San Nicolás [Baños la Cantera]"],
|
||||||
["Divorced","0.13","0.33","0","1"],
|
["01","Aguascalientes","001","Aguascalientes","0120","Buenavista de Peñuelas"],
|
||||||
["Widowed","0.08","0.26","0","1"],
|
["01","Aguascalientes","001","Aguascalientes","0121","Cabecita 3 Marías (Rancho Nuevo)"],
|
||||||
["Separated","0.03","0.16","0","1"],
|
["01","Aguascalientes","001","Aguascalientes","0125","Cañada Grande de Cotorina"],
|
||||||
["Partner","0.65","0.48","0","1"],
|
["01","Aguascalientes","001","Aguascalientes","0126","Cañada Honda [Estación]"],
|
||||||
["Employed","0.55","0.50","0","1"],
|
["01","Aguascalientes","001","Aguascalientes","0127","Los Caños"],
|
||||||
["Fulltime","0.34","0.47","0","1"],
|
["01","Aguascalientes","001","Aguascalientes","0128","El Cariñán"],
|
||||||
["Parttime","0.20","0.40","0","1"],
|
["01","Aguascalientes","001","Aguascalientes","0129","El Carmen [Granja]"],
|
||||||
["Unemployed","0.08","0.28","0","1"],
|
["01","Aguascalientes","001","Aguascalientes","0135","El Cedazo (Cedazo de San Antonio)"],
|
||||||
["Homemaker","0.19","0.40","0","1"],
|
["01","Aguascalientes","001","Aguascalientes","0138","Centro de Arriba (El Taray)"],
|
||||||
["Retired","0.28","0.45","0","1"],
|
["01","Aguascalientes","001","Aguascalientes","0139","Cieneguilla (La Lumbrera)"],
|
||||||
["Household size","2.43","1.22","1","9"],
|
["01","Aguascalientes","001","Aguascalientes","0141","Cobos"],
|
||||||
["Households with children","0.37","0.48","0","1"],
|
["01","Aguascalientes","001","Aguascalientes","0144","El Colorado (El Soyatal)"],
|
||||||
["Number of children","1.67","1.38","0","8"],
|
["01","Aguascalientes","001","Aguascalientes","0146","El Conejal"],
|
||||||
["Lower secondary education","0.08","0.27","0","1"],
|
["01","Aguascalientes","001","Aguascalientes","0157","Cotorina de Abajo"],
|
||||||
["Upper secondary education","0.60","0.49","0","1"],
|
["01","Aguascalientes","001","Aguascalientes","0162","Coyotes"],
|
||||||
["Post secondary, non tert. education","0.12","0.33","0","1"],
|
["01","Aguascalientes","001","Aguascalientes","0166","La Huerta (La Cruz)"],
|
||||||
["First stage tertiary education","0.17","0.38","0","1"],
|
["01","Aguascalientes","001","Aguascalientes","0170","Cuauhtémoc (Las Palomas)"],
|
||||||
["Other education","0.03","0.17","0","1"],
|
["01","Aguascalientes","001","Aguascalientes","0171","Los Cuervos (Los Ojos de Agua)"],
|
||||||
["Household income (Euro/month)","2,127","1,389","22","22,500"],
|
["01","Aguascalientes","001","Aguascalientes","0172","San José [Granja]"],
|
||||||
["Gross wealth - end of 2007 (Euro)","187,281","384,198","0","7,720,000"],
|
["01","Aguascalientes","001","Aguascalientes","0176","La Chiripa"],
|
||||||
["Gross financial wealth - end of 2007 (Euro)","38,855","114,128","0","2,870,000"],
|
["01","Aguascalientes","001","Aguascalientes","0182","Dolores"],
|
||||||
["","Source: SAVE 2008 and 2009, data is weighted and imputed.","","",""],
|
["01","Aguascalientes","001","Aguascalientes","0183","Los Dolores"],
|
||||||
["","","","","ECB"],
|
["01","Aguascalientes","001","Aguascalientes","0190","El Duraznillo"],
|
||||||
["","","","","Working Paper Series No 1299"],
|
["01","Aguascalientes","001","Aguascalientes","0191","Los Durón"],
|
||||||
["","","","","Febuary 2011"]
|
["01","Aguascalientes","001","Aguascalientes","0197","La Escondida"],
|
||||||
|
["01","Aguascalientes","001","Aguascalientes","0201","Brande Vin [Bodegas]"],
|
||||||
|
["01","Aguascalientes","001","Aguascalientes","0207","Valle Redondo"],
|
||||||
|
["01","Aguascalientes","001","Aguascalientes","0209","La Fortuna"],
|
||||||
|
["01","Aguascalientes","001","Aguascalientes","0212","Lomas del Gachupín"],
|
||||||
|
["01","Aguascalientes","001","Aguascalientes","0213","El Carmen (Gallinas Güeras) [Rancho]"],
|
||||||
|
["01","Aguascalientes","001","Aguascalientes","0216","La Gloria"],
|
||||||
|
["01","Aguascalientes","001","Aguascalientes","0226","Hacienda Nueva"],
|
||||||
]
|
]
|
||||||
|
|
||||||
pdfname = os.path.join(testdir,
|
pdfname = os.path.join(testdir, 'mexican_towns.pdf')
|
||||||
"tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-027.pdf")
|
extractor = Stream(Pdf(pdfname, pagenos=[{'start': 1, 'end': 1}],
|
||||||
extractor = Stream(Pdf(pdfname, pagenos=[{'start': 3, 'end': 3}],
|
|
||||||
clean=True))
|
clean=True))
|
||||||
tables = extractor.get_tables()
|
tables = extractor.get_tables()
|
||||||
assert_equal(tables['pg-3'][0], data)
|
assert_equal(tables['page-1'][0], data)
|
||||||
|
|
||||||
|
|
||||||
def test_stream_ncolumns():
|
def test_stream_ncolumns():
|
||||||
|
|
||||||
data = [
|
data = [
|
||||||
["","","","",""],
|
["Bhandara - Key Indicators","","","",""],
|
||||||
["","Bhandara - Key Indicators","","",""],
|
|
||||||
["","DLHS-4 (2012-13)","","DLHS-3 (2007-08)",""],
|
["","DLHS-4 (2012-13)","","DLHS-3 (2007-08)",""],
|
||||||
["Indicators","TOTAL","RURAL","TOTAL","RURAL"],
|
["Indicators","TOTAL","RURAL","TOTAL","RURAL"],
|
||||||
["Reported Prevalence of Morbidity","","","",""],
|
["Reported Prevalence of Morbidity","","","",""],
|
||||||
|
|
@ -105,21 +110,20 @@ def test_stream_ncolumns():
|
||||||
["Moderately High (Systolic >160 mm of Hg & Diastolic >100 mm of Hg ) .....................................","8.2","7.1","",""],
|
["Moderately High (Systolic >160 mm of Hg & Diastolic >100 mm of Hg ) .....................................","8.2","7.1","",""],
|
||||||
["Very High (Systolic >180 mm of Hg & Diastolic >110 mm of Hg ) ...............................................","3.7","3.1","",""],
|
["Very High (Systolic >180 mm of Hg & Diastolic >110 mm of Hg ) ...............................................","3.7","3.1","",""],
|
||||||
["14 Any anaemia below 11g/dl, severe anaemia below 7g/dl. 15 Excluding age group 19 years","","","",""],
|
["14 Any anaemia below 11g/dl, severe anaemia below 7g/dl. 15 Excluding age group 19 years","","","",""],
|
||||||
["","Chronic Illness :Any person with symptoms persisting for longer than one month is defined as suffering from chronic illness","","",""]
|
["Chronic Illness :Any person with symptoms persisting for longer than one month is defined as suffering from chronic illness","","","",""],
|
||||||
|
["","4","","",""]
|
||||||
]
|
]
|
||||||
pdfname = os.path.join(testdir, 'missing_values.pdf')
|
pdfname = os.path.join(testdir, 'missing_values.pdf')
|
||||||
extractor = Stream(Pdf(pdfname, char_margin=1.0, clean=True),
|
extractor = Stream(Pdf(pdfname, char_margin=1.0, clean=True),
|
||||||
ncolumns=5)
|
ncolumns=5)
|
||||||
tables = extractor.get_tables()
|
tables = extractor.get_tables()
|
||||||
assert_equal(tables['pg-1'][0], data)
|
assert_equal(tables['page-1'][0], data)
|
||||||
|
|
||||||
|
|
||||||
def test_stream_columns():
|
def test_stream_columns():
|
||||||
|
|
||||||
data = [
|
data = [
|
||||||
["","","","","",""],
|
["Clave","Nombre Entidad","Clave","Nombre Municipio","Clave","Nombre Localidad"],
|
||||||
["Clave","","Clave","","Clave",""],
|
|
||||||
["","Nombre Entidad","","Nombre Municipio","","Nombre Localidad"],
|
|
||||||
["Entidad","","Municipio","","Localidad",""],
|
["Entidad","","Municipio","","Localidad",""],
|
||||||
["01","Aguascalientes","001","Aguascalientes","0094","Granja Adelita"],
|
["01","Aguascalientes","001","Aguascalientes","0094","Granja Adelita"],
|
||||||
["01","Aguascalientes","001","Aguascalientes","0096","Agua Azul"],
|
["01","Aguascalientes","001","Aguascalientes","0096","Agua Azul"],
|
||||||
|
|
@ -160,10 +164,11 @@ def test_stream_columns():
|
||||||
["01","Aguascalientes","001","Aguascalientes","0209","La Fortuna"],
|
["01","Aguascalientes","001","Aguascalientes","0209","La Fortuna"],
|
||||||
["01","Aguascalientes","001","Aguascalientes","0212","Lomas del Gachupín"],
|
["01","Aguascalientes","001","Aguascalientes","0212","Lomas del Gachupín"],
|
||||||
["01","Aguascalientes","001","Aguascalientes","0213","El Carmen (Gallinas Güeras) [Rancho]"],
|
["01","Aguascalientes","001","Aguascalientes","0213","El Carmen (Gallinas Güeras) [Rancho]"],
|
||||||
["01","Aguascalientes","001","Aguascalientes","0216","La Gloria"]
|
["01","Aguascalientes","001","Aguascalientes","0216","La Gloria"],
|
||||||
|
["01","Aguascalientes","001","Aguascalientes","0226","Hacienda Nueva"],
|
||||||
]
|
]
|
||||||
pdfname = os.path.join(testdir, 'mexican_towns.pdf')
|
pdfname = os.path.join(testdir, 'mexican_towns.pdf')
|
||||||
extractor = Stream(Pdf(pdfname, clean=True),
|
extractor = Stream(Pdf(pdfname, clean=True),
|
||||||
columns='28,67,180,230,425,475,700')
|
columns='28,67,180,230,425,475,700')
|
||||||
tables = extractor.get_tables()
|
tables = extractor.get_tables()
|
||||||
assert_equal(tables['pg-1'][0], data)
|
assert_equal(tables['page-1'][0], data)
|
||||||
367
tools/camelot
367
tools/camelot
|
|
@ -4,8 +4,12 @@ import os
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
import logging
|
import logging
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
from docopt import docopt
|
from docopt import docopt
|
||||||
|
from collections import Counter
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
from PyPDF2 import PdfFileReader
|
from PyPDF2 import PdfFileReader
|
||||||
|
|
||||||
from camelot.pdf import Pdf
|
from camelot.pdf import Pdf
|
||||||
|
|
@ -22,12 +26,23 @@ usage:
|
||||||
options:
|
options:
|
||||||
-h, --help Show this screen.
|
-h, --help Show this screen.
|
||||||
-v, --version Show version.
|
-v, --version Show version.
|
||||||
|
-V, --verbose Verbose.
|
||||||
-p, --pages <pageno> Comma-separated list of page numbers.
|
-p, --pages <pageno> Comma-separated list of page numbers.
|
||||||
Example: -p 1,3-6,10 [default: 1]
|
Example: -p 1,3-6,10 [default: 1]
|
||||||
|
-P, --parallel Parallelize the parsing process.
|
||||||
-f, --format <format> Output format. (csv,tsv,html,json,xlsx) [default: csv]
|
-f, --format <format> Output format. (csv,tsv,html,json,xlsx) [default: csv]
|
||||||
-l, --log Print log to file.
|
-l, --log Log to file.
|
||||||
-V, --verbose Verbose.
|
|
||||||
-o, --output <directory> Output directory.
|
-o, --output <directory> Output directory.
|
||||||
|
-M, --cmargin <cmargin> Char margin. Chars closer than cmargin are
|
||||||
|
grouped together to form a word. [default: 2.0]
|
||||||
|
-L, --lmargin <lmargin> Line margin. Lines closer than lmargin are
|
||||||
|
grouped together to form a textbox. [default: 0.5]
|
||||||
|
-W, --wmargin <wmargin> Word margin. Insert blank spaces between chars
|
||||||
|
if distance between words is greater than word
|
||||||
|
margin. [default: 0.1]
|
||||||
|
-S, --save-info Save parsing info for each page to a file.
|
||||||
|
-X, --plot <dist> Plot distributions. (page,all,rc)
|
||||||
|
-Z, --summary Summarize metrics.
|
||||||
|
|
||||||
camelot methods:
|
camelot methods:
|
||||||
lattice Looks for lines between data.
|
lattice Looks for lines between data.
|
||||||
|
|
@ -47,12 +62,12 @@ options:
|
||||||
cells. Example: -F h, -F v, -F hv
|
cells. Example: -F h, -F v, -F hv
|
||||||
-s, --scale <scale> Scaling factor. Large scaling factor leads to
|
-s, --scale <scale> Scaling factor. Large scaling factor leads to
|
||||||
smaller lines being detected. [default: 15]
|
smaller lines being detected. [default: 15]
|
||||||
|
-i, --invert Invert pdf image to make sure that lines are
|
||||||
|
in foreground.
|
||||||
-j, --jtol <jtol> Tolerance to account for when comparing joint
|
-j, --jtol <jtol> Tolerance to account for when comparing joint
|
||||||
and line coordinates. [default: 2]
|
and line coordinates. [default: 2]
|
||||||
-m, --mtol <mtol> Tolerance to account for when merging lines
|
-m, --mtol <mtol> Tolerance to account for when merging lines
|
||||||
which are very close. [default: 2]
|
which are very close. [default: 2]
|
||||||
-i, --invert Invert pdf image to make sure that lines are
|
|
||||||
in foreground.
|
|
||||||
-d, --debug <debug> Debug by visualizing pdf geometry.
|
-d, --debug <debug> Debug by visualizing pdf geometry.
|
||||||
(contour,line,joint,table) Example: -d table
|
(contour,line,joint,table) Example: -d table
|
||||||
"""
|
"""
|
||||||
|
|
@ -69,17 +84,159 @@ options:
|
||||||
Example: -c 10.1,20.2,30.3
|
Example: -c 10.1,20.2,30.3
|
||||||
-y, --ytol <ytol> Tolerance to account for when grouping rows
|
-y, --ytol <ytol> Tolerance to account for when grouping rows
|
||||||
together. [default: 2]
|
together. [default: 2]
|
||||||
-M, --cmargin <cmargin> Char margin. Chars closer than cmargin are
|
-m, --mtol <mtol> Tolerance to account for when merging columns
|
||||||
grouped together to form a word. [default: 2.0]
|
together. [default: 2]
|
||||||
-L, --lmargin <lmargin> Line margin. Lines closer than lmargin are
|
|
||||||
grouped together to form a textbox. [default: 0.5]
|
|
||||||
-W, --wmargin <wmargin> Word margin. Insert blank spaces between chars
|
|
||||||
if distance between words is greater than word
|
|
||||||
margin. [default: 0.1]
|
|
||||||
-d, --debug Debug by visualizing textboxes.
|
-d, --debug Debug by visualizing textboxes.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def plot_table_barchart(r, c, p, pno, tno):
    """Save a two-panel bar chart of non-empty cell counts for one table.

    The upper panel has one bar per row, the lower panel one bar per column.
    The chart is written to ``<pno>_<tno>.png``, relative to the current
    working directory.

    Parameters
    ----------
    r : sequence of numbers
        Number of non-empty cells in each row.
    c : sequence of numbers
        Number of non-empty cells in each column.
    p : float
        Percentage of empty cells in the table; only shown in the title.
    pno : str
        Page identifier, used as the first part of the file name.
    tno : str
        Table identifier, used as the second part of the file name.
    """
    width = 0.7
    r_index = np.arange(len(r))
    c_index = np.arange(len(c))
    # Tick labels are 1-based row / column numbers.
    row_idx = list(range(1, len(r) + 1))
    col_idx = list(range(1, len(c) + 1))

    fig = plt.figure(figsize=(8, 6))
    try:
        plt.subplot(2, 1, 1)
        plt.title('Percentage of empty cells in table: {0:.2f}'.format(p))
        plt.xlabel('row index')
        plt.ylabel('number of non-empty cells in row')
        # Bars start at the index and are `width` wide, so a tick placed at
        # index + width / 2 sits exactly under the middle of its bar.
        plt.bar(r_index, r, width=width, align='edge')
        plt.xticks(r_index + width * 0.5, row_idx)
        plt.ylim(0, len(c))

        plt.subplot(2, 1, 2)
        plt.xlabel('column index')
        plt.ylabel('number of non-empty cells in column')
        plt.bar(c_index, c, width=width, align='edge')
        plt.xticks(c_index + width * 0.5, col_idx)
        plt.ylim(0, len(r))

        plt.savefig(''.join([pno, '_', tno, '.png']), dpi=300)
    finally:
        # This function runs once per table; without an explicit close every
        # figure would stay alive in pyplot's registry until the process exits.
        plt.close(fig)
|
||||||
|
|
||||||
|
|
||||||
|
def plot_all_barchart(data, output):
    """Save a bar chart of how densely filled the parsed rows are.

    For every row of every table the fraction of non-empty cells
    (``r_nempty_cells`` entry divided by ``ncols``) is computed.  The chart
    shows, for each distinct fraction, the share of rows that have it.  The
    image is written to ``<output>_all.png``.

    Parameters
    ----------
    data : dict
        Mapping of page key to a mapping of table key to a table dict.  Each
        table dict must provide ``'ncols'`` and ``'r_nempty_cells'``.
    output : str
        Path prefix of the image file.
    """
    fractions = []
    for page in data.values():
        for table in page.values():
            ncols = float(table['ncols'])
            fractions.extend(n / ncols for n in table['r_nempty_cells'])

    counts = Counter(fractions)
    # Always show both ends of the scale, even when no row falls on them.
    for edge in (0.0, 1.0):
        counts.setdefault(edge, 0)

    total = float(sum(counts.values()))
    if total:
        row_p = [count / total for count in counts.values()]
    else:
        # No rows were parsed at all: plot empty bars instead of dividing by 0.
        row_p = [0.0 for _ in counts]

    fig = plt.figure(figsize=(8, 6))
    try:
        plt.xlabel('percentage of non-empty cells in a row')
        plt.ylabel('percentage of rows processed')
        plt.bar(list(counts.keys()), row_p, align='center', width=0.05)
        plt.ylim(0, 1.0)
        plt.savefig(''.join([output, '_all.png']), dpi=300)
    finally:
        # Release the figure once it has been written to disk.
        plt.close(fig)
|
||||||
|
|
||||||
|
|
||||||
|
def plot_rc_piechart(data, output):
    """Save pie charts of the row-count and column-count distributions.

    The upper pie shows how many tables have each distinct number of rows,
    the lower pie does the same for the number of columns.  The image is
    written to ``<output>_rc.png``.

    Parameters
    ----------
    data : dict
        Mapping of page key to a mapping of table key to a table dict.  Each
        table dict must provide ``'nrows'`` and ``'ncols'``.
    output : str
        Path prefix of the image file.
    """
    from matplotlib import cm

    rows, cols = [], []
    for page in data.values():
        for table in page.values():
            rows.append(table['nrows'])
            cols.append(table['ncols'])

    panels = (
        (211, Counter(rows), 'row distribution across tables'),
        (212, Counter(cols), 'column distribution across tables'),
    )

    fig = plt.figure(figsize=(8, 6))
    try:
        for position, counts, title in panels:
            # Spread the wedge colours evenly over the Set1 colormap.
            colors = cm.Set1(np.arange(len(counts)) / float(len(counts)))
            ax = plt.subplot(position, aspect='equal')
            # Materialise keys/values so plain sequences reach matplotlib.
            ax.pie(list(counts.values()), colors=colors,
                   labels=list(counts.keys()), startangle=90)
            ax.set_title(title)
        plt.savefig(''.join([output, '_rc.png']), dpi=300)
    finally:
        # Release the figure once it has been written to disk.
        plt.close(fig)
|
||||||
|
|
||||||
|
|
||||||
|
def _continuous_page_runs(tables):
    """Return ``(first_page, last_page)`` string pairs of consecutive pages.

    ``tables`` holds ``(page_key, table_key, ncols)`` tuples.  They are sorted
    by the integers found after the first 5 characters of the page key and the
    first 6 characters of the table key, grouped by equal neighbouring
    ``ncols``, and each group is split into runs of consecutive page numbers.
    """
    from itertools import groupby
    from operator import itemgetter

    ordered = sorted(tables, key=lambda t: (int(t[0][5:]), int(t[1][6:])))
    runs = []
    for _, same_ncols in groupby(ordered, key=itemgetter(2)):
        pages = sorted(set(int(t[0][5:]) for t in same_ncols))
        # Consecutive page numbers share the same (position - page) offset.
        for _, run in groupby(enumerate(pages),
                              key=lambda pair: pair[0] - pair[1]):
            run = list(run)
            runs.append((str(run[0][1]), str(run[-1][1])))
    return runs


def _guess_header(data, first_page, last_page):
    """Return the most plausible header row for pages first_page..last_page.

    Candidates are the ten most frequent rows that contain no empty cell and
    are exactly ``ncols`` wide.  The candidate whose most common row position
    is smallest wins.  An empty tuple is returned when nothing qualifies.
    """
    header_idx = {}
    ncols = 0
    for page_number in range(int(first_page), int(last_page) + 1):
        page = data['page-{0}'.format(page_number)]
        for table_number in page.keys():
            table = page[table_number]
            ncols = table['ncols']
            for i, row in enumerate(table['data']):
                header_idx.setdefault(tuple(row), []).append(i)

    frequent = sorted(header_idx, key=lambda k: len(header_idx[k]),
                      reverse=True)[:10]
    modes = []
    for row in frequent:
        candidate = tuple(cell for cell in row if cell != '')
        if len(candidate) != ncols or candidate not in header_idx:
            continue
        positions = header_idx[candidate]
        # Most common position of this row inside its tables.
        modes.append((candidate, max(set(positions), key=positions.count)))

    if not modes:
        return ()
    return min(modes, key=lambda mode: mode[1])[0]


def summary(data, p_time):
    """Print a summary of a parsing run.

    The report contains the number of pages, the total and per-page time, the
    number of tables and their average score.  When more than one page was
    processed it also lists the page ranges whose tables share a column count
    and a guessed header row for each range.

    Parameters
    ----------
    data : dict
        Mapping of page key (``'page-<n>'``) to a mapping of table key to a
        table dict.  Table keys must carry their number from index 6 onwards
        (presumably ``'table-<n>'``).  Each table dict must provide
        ``'ncols'``, ``'score'`` and ``'data'`` (a list of rows).
    p_time : float
        Total processing time in seconds.
    """
    num_pages = len(data)
    scores = []
    continuous_tables = []
    for page_number in data.keys():
        page = data[page_number]
        for table_number in page.keys():
            table = page[table_number]
            continuous_tables.append(
                (page_number, table_number, table['ncols']))
            scores.append(table['score'])
    total_tables = len(continuous_tables)
    # np.mean([]) would also give nan, but with a RuntimeWarning.
    avg_score = np.mean(scores) if scores else float('nan')

    ct_pages = []
    header_string = ""
    # A multi-page report only makes sense for more than one page; this also
    # keeps the header text from being glued onto the "Average score" line.
    if num_pages > 1 and total_tables > 1:
        ct_pages = _continuous_page_runs(continuous_tables)
        result_headers = [_guess_header(data, first, last)
                          for first, last in ct_pages]
        header_lines = [
            'pages {0} -> "{1}"'.format('-'.join(ct), '","'.join(header))
            for ct, header in zip(ct_pages, result_headers)]
        header_string = ''.join(
            ["Multi-page table headers*:\n", '\n'.join(header_lines)])

    if num_pages > 1:
        avg_time = "Time taken per page: {0:.2f} seconds\n".format(
            p_time / float(num_pages))
        equal_ncols = "\nMulti-page tables on*: {0}\n".format(
            ', '.join(['-'.join(ct) for ct in ct_pages]))
    else:
        avg_time = ""
        equal_ncols = ""

    stats = [num_pages, p_time, avg_time, total_tables, avg_score, equal_ncols]
    stat_string = ("Pages processed: {0}\nTime taken: {1:.2f} seconds\n"
                   "{2}Tables found: {3}\nAverage score: {4:.2f}{5}".format(*stats))

    print(''.join([stat_string, header_string]))
|
||||||
|
|
||||||
|
|
||||||
def convert_to_html(table):
|
def convert_to_html(table):
|
||||||
html = ''
|
html = ''
|
||||||
html = ''.join([html, '<table border="1">\n'])
|
html = ''.join([html, '<table border="1">\n'])
|
||||||
|
|
@ -99,23 +256,23 @@ def write_to_disk(data, f='csv', output=None, filename=None):
|
||||||
if f in ['csv', 'tsv']:
|
if f in ['csv', 'tsv']:
|
||||||
import csv
|
import csv
|
||||||
delimiter = ',' if f == 'csv' else '\t'
|
delimiter = ',' if f == 'csv' else '\t'
|
||||||
for page in sorted(data):
|
for page_number in sorted(data.keys()):
|
||||||
for table in range(len(data[page])):
|
for table_number in sorted(data[page_number].keys()):
|
||||||
dsvname = '{0}_table_{1}.{2}'.format(page, table + 1, f)
|
dsvname = '{0}.{1}'.format(''.join([page_number, '_', table_number]), f)
|
||||||
with open(os.path.join(output, dsvname), 'w') as outfile:
|
with open(os.path.join(output, dsvname), 'w') as outfile:
|
||||||
writer = csv.writer(
|
writer = csv.writer(
|
||||||
outfile, delimiter=delimiter, quoting=csv.QUOTE_ALL)
|
outfile, delimiter=delimiter, quoting=csv.QUOTE_ALL)
|
||||||
for row in data[page][table]:
|
for row in data[page_number][table_number]['data']:
|
||||||
writer.writerow(row)
|
writer.writerow(row)
|
||||||
elif f == 'html':
|
elif f == 'html':
|
||||||
htmlname = '{}.html'.format(froot)
|
htmlname = '{0}.html'.format(froot)
|
||||||
for page in sorted(data):
|
for page_number in sorted(data.keys()):
|
||||||
for table in range(len(data[page])):
|
for table_number in sorted(data[page_number].keys()):
|
||||||
with open(os.path.join(output, htmlname), 'a') as htmlfile:
|
with open(os.path.join(output, htmlname), 'a') as htmlfile:
|
||||||
htmlfile.write(convert_to_html(data[page][table]))
|
htmlfile.write(convert_to_html(data[page_number][table_number]['data']))
|
||||||
elif f == 'json':
|
elif f == 'json':
|
||||||
import json
|
import json
|
||||||
with open(os.path.join(output, '{}.json'.format(froot)), 'w') \
|
with open(os.path.join(output, '{0}.json'.format(froot)), 'w') \
|
||||||
as jsonfile:
|
as jsonfile:
|
||||||
json.dump(data, jsonfile)
|
json.dump(data, jsonfile)
|
||||||
elif f == 'xlsx':
|
elif f == 'xlsx':
|
||||||
|
|
@ -123,12 +280,12 @@ def write_to_disk(data, f='csv', output=None, filename=None):
|
||||||
from pyexcel_xlsx import save_data
|
from pyexcel_xlsx import save_data
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
xlsx_data = OrderedDict()
|
xlsx_data = OrderedDict()
|
||||||
for page in sorted(data):
|
for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
|
||||||
for table in range(len(data[page])):
|
for table_number in sorted(data[page_number].keys(), key=lambda x: int(x[6:])):
|
||||||
sheet_name = '{0}_table_{1}'.format(page, table + 1)
|
sheet_name = ''.join([page_number, '_', table_number])
|
||||||
xlsx_data.update({sheet_name:
|
xlsx_data.update({sheet_name:
|
||||||
[row for row in data[page][table]]})
|
[row for row in data[page_number][table_number]['data']]})
|
||||||
save_data(os.path.join(output, '{}.xlsx'.format(froot)), xlsx_data)
|
save_data(os.path.join(output, '{0}.xlsx'.format(froot)), xlsx_data)
|
||||||
except ImportError:
|
except ImportError:
|
||||||
print("link to install docs")
|
print("link to install docs")
|
||||||
|
|
||||||
|
|
@ -147,16 +304,17 @@ if __name__ == '__main__':
|
||||||
filename = args['<file>']
|
filename = args['<file>']
|
||||||
filedir = os.path.dirname(args['<file>'])
|
filedir = os.path.dirname(args['<file>'])
|
||||||
logname, __ = os.path.splitext(filename)
|
logname, __ = os.path.splitext(filename)
|
||||||
logname += '.log'
|
logname = ''.join([logname, '.log'])
|
||||||
|
scorename, __ = os.path.splitext(filename)
|
||||||
|
scorename = ''.join([scorename, '_info.csv'])
|
||||||
|
pngname, __ = os.path.splitext(filename)
|
||||||
|
|
||||||
if args['--log']:
|
if args['--log']:
|
||||||
|
FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
|
||||||
if args['--output']:
|
if args['--output']:
|
||||||
logname = os.path.join(args['--output'], os.path.basename(logname))
|
logname = os.path.join(args['--output'], os.path.basename(logname))
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
filename=logname, filemode='w', level=logging.DEBUG)
|
filename=logname, filemode='w', format=FORMAT, level=logging.DEBUG)
|
||||||
else:
|
|
||||||
logging.basicConfig(
|
|
||||||
filename=logname, filemode='w', level=logging.DEBUG)
|
|
||||||
|
|
||||||
p = []
|
p = []
|
||||||
if args['--pages'] == '1':
|
if args['--pages'] == '1':
|
||||||
|
|
@ -173,47 +331,142 @@ if __name__ == '__main__':
|
||||||
else:
|
else:
|
||||||
p.append({'start': int(r), 'end': int(r)})
|
p.append({'start': int(r), 'end': int(r)})
|
||||||
|
|
||||||
|
margin_tuple = (float(args['--cmargin']), float(args['--lmargin']),
|
||||||
|
float(args['--wmargin']))
|
||||||
if args['<method>'] == 'lattice':
|
if args['<method>'] == 'lattice':
|
||||||
try:
|
try:
|
||||||
extractor = Lattice(Pdf(filename, pagenos=p, clean=True),
|
manager = Pdf(Lattice(
|
||||||
fill=args['--fill'],
|
fill=args['--fill'],
|
||||||
scale=int(args['--scale']),
|
scale=int(args['--scale']),
|
||||||
jtol=int(args['--jtol']),
|
invert=args['--invert'],
|
||||||
mtol=int(args['--mtol']),
|
jtol=int(args['--jtol']),
|
||||||
invert=args['--invert'],
|
mtol=int(args['--mtol']),
|
||||||
debug=args['--debug'],
|
pdf_margin=margin_tuple,
|
||||||
verbose=args['--verbose'])
|
debug=args['--debug']),
|
||||||
data = extractor.get_tables()
|
filename,
|
||||||
|
pagenos=p,
|
||||||
|
parallel=args['--parallel'],
|
||||||
|
clean=True)
|
||||||
|
data = manager.extract()
|
||||||
|
|
||||||
|
processing_time = time.time() - start_time
|
||||||
|
vprint("Finished processing in", processing_time, "seconds")
|
||||||
|
logging.info("Finished processing in " + str(processing_time) + " seconds")
|
||||||
|
|
||||||
|
if args['--plot']:
|
||||||
|
if args['--output']:
|
||||||
|
pngname = os.path.join(args['--output'], os.path.basename(pngname))
|
||||||
|
plot_type = args['--plot'].split(',')
|
||||||
|
if 'page' in plot_type:
|
||||||
|
for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
|
||||||
|
page = data[page_number]
|
||||||
|
for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
|
||||||
|
table = page[table_number]
|
||||||
|
plot_table_barchart(table['r_nempty_cells'],
|
||||||
|
table['c_nempty_cells'],
|
||||||
|
table['empty_p'],
|
||||||
|
page_number,
|
||||||
|
table_number)
|
||||||
|
|
||||||
|
if 'all' in plot_type:
|
||||||
|
plot_all_barchart(data, pngname)
|
||||||
|
|
||||||
|
if 'rc' in plot_type:
|
||||||
|
plot_rc_piechart(data, pngname)
|
||||||
|
|
||||||
|
if args['--summary']:
|
||||||
|
summary(data, processing_time)
|
||||||
|
|
||||||
|
if args['--save-info']:
|
||||||
|
if args['--output']:
|
||||||
|
scorename = os.path.join(args['--output'], os.path.basename(scorename))
|
||||||
|
with open(scorename, 'w') as score_file:
|
||||||
|
score_file.write('table,nrows,ncols,empty_p,line_p,text_p,score\n')
|
||||||
|
for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
|
||||||
|
page = data[page_number]
|
||||||
|
for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
|
||||||
|
table = page[table_number]
|
||||||
|
score_file.write('{0},{1},{2},{3},{4},{5},{6}\n'.format(
|
||||||
|
''.join([page_number, '_', table_number]),
|
||||||
|
table['nrows'],
|
||||||
|
table['ncols'],
|
||||||
|
table['empty_p'],
|
||||||
|
table['line_p'],
|
||||||
|
table['text_p'],
|
||||||
|
table['score']))
|
||||||
if args['--debug']:
|
if args['--debug']:
|
||||||
extractor.plot_geometry(args['--debug'])
|
manager.debug_plot()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.exception(e.message, exc_info=True)
|
logging.exception(e.message, exc_info=True)
|
||||||
sys.exit()
|
sys.exit()
|
||||||
elif args['<method>'] == 'stream':
|
elif args['<method>'] == 'stream':
|
||||||
try:
|
try:
|
||||||
extractor = Stream(Pdf(filename, pagenos=p,
|
manager = Pdf(Stream(
|
||||||
char_margin=float(args['--cmargin']),
|
ncolumns=int(args['--ncols']),
|
||||||
line_margin=float(args['--lmargin']),
|
columns=args['--columns'],
|
||||||
word_margin=float(args['--wmargin']),
|
ytol=int(args['--ytol']),
|
||||||
clean=True),
|
mtol=int(args['--mtol']),
|
||||||
ncolumns=int(args['--ncols']),
|
pdf_margin=margin_tuple,
|
||||||
columns=args['--columns'],
|
debug=args['--debug']),
|
||||||
ytol=int(args['--ytol']),
|
filename,
|
||||||
debug=args['--debug'],
|
pagenos=p,
|
||||||
verbose=args['--verbose'])
|
parallel=args['--parallel'],
|
||||||
data = extractor.get_tables()
|
clean=True)
|
||||||
|
data = manager.extract()
|
||||||
|
|
||||||
|
processing_time = time.time() - start_time
|
||||||
|
vprint("Finished processing in", processing_time, "seconds")
|
||||||
|
logging.info("Finished processing in " + str(processing_time) + " seconds")
|
||||||
|
|
||||||
|
if args['--plot']:
|
||||||
|
if args['--output']:
|
||||||
|
pngname = os.path.join(args['--output'], os.path.basename(pngname))
|
||||||
|
plot_type = args['--plot'].split(',')
|
||||||
|
if 'page' in plot_type:
|
||||||
|
for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
|
||||||
|
page = data[page_number]
|
||||||
|
for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
|
||||||
|
table = page[table_number]
|
||||||
|
plot_table_barchart(table['r_nempty_cells'],
|
||||||
|
table['c_nempty_cells'],
|
||||||
|
table['empty_p'],
|
||||||
|
page_number,
|
||||||
|
table_number)
|
||||||
|
|
||||||
|
if 'all' in plot_type:
|
||||||
|
plot_all_barchart(data, pngname)
|
||||||
|
|
||||||
|
if 'rc' in plot_type:
|
||||||
|
plot_rc_piechart(data, pngname)
|
||||||
|
|
||||||
|
if args['--summary']:
|
||||||
|
summary(data, processing_time)
|
||||||
|
|
||||||
|
if args['--save-info']:
|
||||||
|
if args['--output']:
|
||||||
|
scorename = os.path.join(args['--output'], os.path.basename(scorename))
|
||||||
|
with open(scorename, 'w') as score_file:
|
||||||
|
score_file.write('table,nrows,ncols,empty_p,,score\n')
|
||||||
|
for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
|
||||||
|
page = data[page_number]
|
||||||
|
for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
|
||||||
|
table = page[table_number]
|
||||||
|
score_file.write('{0},{1},{2},{3},{4}\n'.format(
|
||||||
|
''.join([page_number, '_', table_number]),
|
||||||
|
table['nrows'],
|
||||||
|
table['ncols'],
|
||||||
|
table['empty_p'],
|
||||||
|
table['score']))
|
||||||
|
|
||||||
if args['--debug']:
|
if args['--debug']:
|
||||||
extractor.plot_text()
|
manager.debug_plot()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.exception(e.message, exc_info=True)
|
logging.exception(e.message, exc_info=True)
|
||||||
sys.exit()
|
sys.exit()
|
||||||
|
|
||||||
if data is None:
|
if args['--debug']:
|
||||||
print("See 'camelot <method> -h' for various parameters you can tweak.")
|
print("See 'camelot <method> -h' for various parameters you can tweak.")
|
||||||
else:
|
else:
|
||||||
output = filedir if args['--output'] is None else args['--output']
|
output = filedir if args['--output'] is None else args['--output']
|
||||||
write_to_disk(data, f=args['--format'],
|
write_to_disk(data, f=args['--format'],
|
||||||
output=output, filename=filename)
|
output=output, filename=filename)
|
||||||
|
|
||||||
vprint("finished in", time.time() - start_time, "seconds")
|
|
||||||
logging.info("Time taken: " + str(time.time() - start_time) + " seconds")
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue