Modify command line tool

Precompute globs

Replace argparse with docopt

Fix CLI

Update .gitignore

Add docstrings

Update README

Fix typo

Replace zip subprocess call

Use tempfile

Fix newline
pull/2/head
Vinayak Mehta 2016-07-19 16:45:28 +05:30
parent 3045a92630
commit 271d4cafd6
11 changed files with 895 additions and 373 deletions

4
.gitignore vendored
View File

@ -1,3 +1,5 @@
__pycache__/
*.py[cod]
.camelot/
*.so
.camelot/

View File

@ -14,63 +14,24 @@ camelot also uses poppler-utils, more specifically `pdfseparate` to separate a p
## Usage
python2 camelot.py [options] file
<pre>
camelot.py [options] <method> [<args>...]
positional arguments:
options:
-h, --help Show this screen.
-v, --version Show version.
-p, --pages &lt;pageno&gt; Comma-separated list of page numbers.
Example: -p 1,3-6,10 [default: 1]
-f, --format &lt;format&gt; Output format. (csv,xlsx) [default: csv]
-l, --log Print log to file.
-o, --output &lt;directory&gt; Output directory.
file
camelot methods:
lattice Looks for lines between data.
stream Looks for spaces between data.
optional arguments:
-h, --help
show this help message and exit
-p, --pages PAGES [PAGES ...]
Specify the page numbers and/or page ranges to be
parsed. Example: -p="1 3-5 9", -p="all" (default: 1)
-f, --format FORMAT
Output format (csv/xlsx). Example: -f="xlsx" (default: csv)
-m, --spreadsheet
Extract tables with ruling lines. (default: False)
-F, --fill FILL
Fill the values in empty cells horizontally(h) and/or
vertically(v). Example: -F="h", -F="v", -F="hv" (default: None)
-s, --scale [SCALE]
Scaling factor. Large scaling factor leads to smaller
lines being detected. (default: 15)
-j, --jtol [JTOL]
Tolerance to account for when comparing joint and line
coordinates. (default: 2)
-M, --mtol [MTOL]
Tolerance to account for when merging lines which are
very close. (default: 2)
-i, --invert
Make sure lines are in foreground. (default: False)
-d, --debug DEBUG
Debug by visualizing contours, lines, joints, tables.
Example: --debug="contours"
-o, --output OUTPUT
Specify output directory.
See 'camelot <method> -h' for more information on a specific method.
</pre>
## Development

View File

@ -1,80 +0,0 @@
import os
import csv
import numpy as np
from pdf import get_pdf_info
def overlap(l):
    """Merge overlapping (start, end) intervals.

    Parameters
    ----------
    l : list
        List of (start, end) pairs; each pair is indexed as [0] and [1].

    Returns
    -------
    merged : list
        Intervals in ascending order with every overlapping or touching
        pair collapsed into a single (start, end) tuple.
    """
    merged = []
    # Sort here so the result does not depend on the caller's ordering;
    # for input that is already sorted this is a no-op.
    for higher in sorted(l):
        if merged and higher[0] <= merged[-1][1]:
            lower = merged[-1]
            # `higher` starts inside (or right at the end of) `lower`:
            # keep the earlier start and extend the end if needed.
            merged[-1] = (lower[0], max(lower[1], higher[1]))
        else:
            merged.append(higher)
    return merged
def get_row_idx(t, rows):
    """Return the index of the first row whose bounds contain `t` vertically.

    Parameters
    ----------
    t : object
        Object exposing `y0` and `y1` attributes.
    rows : list
        List of pairs; `t` matches a row when `t.y1 <= row[0]` and
        `t.y0 >= row[1]`.

    Returns
    -------
    r : int or None
        Index of the matching row, or None when no row contains `t`.
    """
    for r, row in enumerate(rows):
        if t.y1 <= row[0] and t.y0 >= row[1]:
            return r
    return None
def get_column_idx(t, columns):
    """Return the index of the first column whose bounds contain `t` horizontally.

    Parameters
    ----------
    t : object
        Object exposing `x0` and `x1` attributes.
    columns : list
        List of pairs; `t` matches a column when `t.x0 >= column[0]` and
        `t.x1 <= column[1]`.

    Returns
    -------
    c : int or None
        Index of the matching column, or None when no column contains `t`.
    """
    for c, column in enumerate(columns):
        if t.x0 >= column[0] and t.x1 <= column[1]:
            return c
    return None
def basic(pdf_dir, filename, char_margin, line_margin, word_margin):
    """Extract a table from a single-page pdf by grouping text into rows.

    Text objects returned by get_pdf_info are grouped into rows by their
    y0 coordinate, the most common row length (ignoring length 1) is used
    to pick the rows that define the columns, and every text object is
    assigned to the column whose centre is nearest. The result is written
    as a csv file next to the pdf.

    Parameters
    ----------
    pdf_dir : string
        Directory containing the pdf; the csv is written here too.
    filename : string
        Name of the pdf file inside `pdf_dir`.
    char_margin : float
    line_margin : float
    word_margin : float
        Passed through to get_pdf_info.

    Raises
    ------
    ValueError
        If no row with a usable number of text elements is found.
    """
    print("working on %s" % filename)
    text, _, _ = get_pdf_info(os.path.join(pdf_dir, filename), 'basic',
                              char_margin, line_margin, word_margin)
    # top-to-bottom, then left-to-right
    text.sort(key=lambda x: (-x.y0, x.x0))

    y_last = 0
    data = []
    temp = []
    for t in text:
        # is checking for upright necessary?
        if not t.get_text().strip():
            continue
        if not np.isclose(y_last, t.y0, atol=2):
            y_last = t.y0
            # Only flush a row that actually holds text; flushing
            # unconditionally produced an empty first row.
            if temp:
                data.append(temp)
                temp = []
        temp.append(t)
    # The row being built when the loop ends was previously dropped.
    if temp:
        data.append(temp)

    # a table can't have just 1 column, can it?
    elements = [len(d) for d in data if len(d) != 1]
    if not elements:
        raise ValueError(
            "no row with more than one text element found in %s" % filename)
    mode = max(set(elements), key=elements.count)

    # Column centres come from the rows whose length equals the mode.
    columns = [(t.x0, t.x1) for d in data for t in d if len(d) == mode]
    columns = overlap(sorted(columns))
    columns = [(c[0] + c[1]) / 2.0 for c in columns]

    output = [['' for c in columns] for d in data]
    for row, d in enumerate(data):
        for t in d:
            cog = (t.x0 + t.x1) / 2.0
            diff = [(i, abs(cog - c)) for i, c in enumerate(columns)]
            idx = min(diff, key=lambda x: x[1])[0]
            cell_text = t.get_text().strip()
            if output[row][idx]:
                output[row][idx] += ' ' + cell_text
            else:
                output[row][idx] = cell_text

    csvname = filename.split('.')[0] + '.csv'
    csvpath = os.path.join(pdf_dir, csvname)
    with open(csvpath, 'w') as outfile:
        writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)
        for row in output:
            writer.writerow([cell.encode('utf-8') for cell in row])

340
camelot.py 100644 → 100755
View File

@ -1,136 +1,258 @@
#!/usr/bin/env python2
import os
import re
import csv
import sys
import glob
import time
import shutil
import logging
import zipfile
import tempfile
import subprocess
import argparse
from docopt import docopt
from werkzeug.utils import secure_filename
from basic import basic
from spreadsheet import spreadsheet
from lattice import lattice
from stream import stream
doc = """
camelot parses tables from PDFs!
usage:
camelot.py [options] <method> [<args>...]
options:
-h, --help Show this screen.
-v, --version Show version.
-p, --pages <pageno> Comma-separated list of page numbers.
Example: -p 1,3-6,10 [default: 1]
-f, --format <format> Output format. (csv,xlsx) [default: csv]
-l, --log Print log to file.
-o, --output <directory> Output directory.
camelot methods:
lattice Looks for lines between data.
stream Looks for spaces between data.
See 'camelot <method> -h' for more information on a specific method.
"""
lattice_doc = """
Lattice method looks for lines between data to form a table.
usage:
camelot.py lattice [options] [--] <file>
options:
-F, --fill <fill> Fill data in horizontal and/or vertical spanning
cells. Example: -F h, -F v, -F hv
-s, --scale <scale> Scaling factor. Large scaling factor leads to
smaller lines being detected. [default: 15]
-j, --jtol <jtol> Tolerance to account for when comparing joint
and line coordinates. [default: 2]
-m, --mtol <mtol> Tolerance to account for when merging lines
which are very close. [default: 2]
-i, --invert Invert pdf image to make sure that lines are
in foreground.
-d, --debug <debug> Debug by visualizing pdf geometry.
(contour,line,joint,table) Example: -d table
"""
stream_doc = """
Stream method looks for spaces between data to form a table.
usage:
camelot.py stream [options] [--] <file>
options:
-n, --ncols <ncols> Number of columns. [default: 0]
-c, --columns <columns> Comma-separated list of column x-coordinates.
Example: -c 10.1,20.2,30.3
-M, --cmargin <cmargin> Char margin. Chars closer than cmargin are
grouped together to form a word. [default: 2.0]
-L, --lmargin <lmargin> Line margin. Lines closer than lmargin are
grouped together to form a textbox. [default: 0.5]
-W, --wmargin <wmargin> Word margin. Insert blank spaces between chars
if distance between words is greater than word
margin. [default: 0.1]
-d, --debug Debug by visualizing textboxes.
"""
pno = re.compile(r'\d+')
def mkdir(directory):
    """Create `directory` (and any missing parents) if it does not exist.

    Parameters
    ----------
    directory : string

    Raises
    ------
    OSError
        If the directory cannot be created, or the path exists but is not
        a directory.
    """
    import errno
    # Try first and inspect the error instead of checking isdir() up front:
    # another process may create the directory between check and creation.
    try:
        os.makedirs(directory)
    except OSError as e:
        if e.errno != errno.EEXIST or not os.path.isdir(directory):
            raise
def filesort(filename):
filename = filename.split('/')[-1]
def filesort(filepath):
filename = os.path.basename(filepath)
num = pno.findall(filename)
if len(num) == 2:
return (int(num[0]), int(num[1]))
else:
return (int(num[0]), 0)
start_time = time.time()
CAMELOT_DIR = '.camelot/'
mkdir(CAMELOT_DIR)
parser = argparse.ArgumentParser(
description='Parse tables from pdfs!', usage='python2 camelot.py [options] file')
parser.add_argument('-p', '--pages', nargs='+', action='store', dest='pages',
help='Specify the page numbers and/or page ranges to be parsed. Example: -p="1 3-5 9", -p="all" (default: 1)')
parser.add_argument('-f', '--format', nargs=1, action='store', dest='format',
help='Output format (csv/xlsx). Example: -f="xlsx" (default: csv)', default=["csv"])
parser.add_argument('-s', '--spreadsheet', action='store_true', dest='spreadsheet',
help='Extract tables with ruling lines. (default: False)')
parser.add_argument('-i', '--fill', action='store', dest='fill',
help='Fill the values in empty cells horizontally(h) and/or vertically(v). Example: -F="h", -F="v", -F="hv" (default: None)', default=None)
parser.add_argument('-c', '--scale', nargs='?', action='store', dest='scale',
help='Scaling factor. Large scaling factor leads to smaller lines being detected. (default: 15)', default=15, type=int)
parser.add_argument('-j', '--jtol', nargs='?', action='store',
dest='jtol', help='Tolerance to account for when comparing joint and line coordinates. (default: 2)', default=2, type=int)
parser.add_argument('-t', '--mtol', nargs='?', action='store',
dest='mtol', help='Tolerance to account for when merging lines which are very close. (default: 2)', default=2, type=int)
parser.add_argument('-n', '--invert', action='store_true', dest='invert',
help='Make sure lines are in foreground. (default: False)')
parser.add_argument('-d', '--debug', nargs=1, action='store', dest='debug',
help='Debug by visualizing contours, lines, joints, tables. Example: --debug="contours"')
parser.add_argument('-M', '--char-margin', nargs='?', action='store', dest='char_margin',
help='(default: 2.0)', default=2.0, type=float)
parser.add_argument('-L', '--line-margin', nargs='?', action='store', dest='line_margin',
help='(default: 0.5)', default=0.5, type=float)
parser.add_argument('-W', '--word-margin', nargs='?', action='store', dest='word_margin',
help='(default: 0.1)', default=0.1, type=float)
parser.add_argument('-o', '--output', nargs=1, action='store', dest='output',
help='Specify output directory.')
parser.add_argument('file', nargs=1)
if __name__ == '__main__':
start_time = time.time()
tmpdir = tempfile.mkdtemp()
result = parser.parse_args()
args = docopt(doc, version='0.1', options_first=True)
argv = [args['<method>']] + args['<args>']
if args['<method>'] == 'lattice':
args.update(docopt(lattice_doc, argv=argv))
elif args['<method>'] == 'stream':
args.update(docopt(stream_doc, argv=argv))
if result.pages:
if result.pages == ['all']:
p = result.pages
if args['--pages']:
if args['--pages'] == ['all']:
p = args['--pages']
else:
p = []
for r in args['--pages'].split(','):
if '-' in r:
a, b = r.split('-')
a, b = int(a), int(b)
p.extend([str(i) for i in range(a, b + 1)])
else:
p.extend([str(r)])
else:
p = []
for r in result.pages[0].split(' '):
if '-' in r:
a, b = r.split('-')
a, b = int(a), int(b)
p.extend([str(i) for i in range(a, b + 1)])
else:
p.extend([str(r)])
else:
p = ['1']
p = sorted(set(p))
p = ['1']
p = sorted(set(p))
filename = result.file[0].split('/')[-1]
# pdf_dir = os.path.join(CAMELOT_DIR, os.urandom(16).encode('hex'))
pdf_dir = os.path.join(CAMELOT_DIR, filename.split('.')[0])
mkdir(pdf_dir)
logging.basicConfig(filename=os.path.join(pdf_dir, filename.split('.')[
0] + '.log'), filemode='w', level=logging.DEBUG)
fname = os.path.basename(args['<file>'])
fname = secure_filename(fname)
fdir = os.path.dirname(args['<file>'])
froot, fext = os.path.splitext(fname)
if fext.lower() != '.pdf':
print "camelot can parse only pdfs right now"
sys.exit()
shutil.copy(result.file[0], os.path.join(pdf_dir, filename))
print "separating pdf into pages"
print
if p == ['all']:
subprocess.call(['pdfseparate', os.path.join(
pdf_dir, filename), os.path.join(pdf_dir, 'pg-%d.pdf')])
else:
for page in p:
subprocess.call(['pdfseparate', '-f', page, '-l', page, os.path.join(
pdf_dir, filename), os.path.join(pdf_dir, 'pg-' + page + '.pdf')])
logfname = os.path.join(tmpdir, froot + '.log')
logging.basicConfig(filename=logfname, filemode='w', level=logging.DEBUG)
if result.spreadsheet:
print "using the spreadsheet method"
for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))):
print "converting", g.split('/')[-1], "to image"
os.system(' '.join(['convert', '-density', '300',
g, '-depth', '8', g[:-4] + '.png']))
try:
spreadsheet(pdf_dir, g.split('/')[-1], result.fill, result.scale,
result.jtol, result.mtol, result.invert, result.debug,
result.char_margin, result.line_margin, result.word_margin)
except:
logging.error("Couldn't parse " + g.split('/')[-1])
print "Couldn't parse", g.split('/')[-1]
else:
print "using the basic method"
for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))):
basic(pdf_dir, g.split('/')[-1], result.char_margin, result.line_margin, result.word_margin)
if result.format == ['xlsx']:
import csv
from pyexcel_xlsx import save_data
from collections import OrderedDict
data = OrderedDict()
for c in sorted(glob.glob(os.path.join(pdf_dir, '*.csv')), key=filesort):
print "adding", c.split('/')[-1], "to excel file"
with open(c, 'r') as csvfile:
reader = csv.reader(csvfile)
data.update({c.split('/')[-1].split('.')
[0]: [row for row in reader]})
xlsxname = filename.split('.')[0] + '.xlsx'
xlsxpath = os.path.join(pdf_dir, xlsxname)
save_data(xlsxpath, data)
shutil.copy(args['<file>'], os.path.join(tmpdir, fname))
print "separating pdf into pages"
print
print "saved as", xlsxname
if p == ['all']:
subprocess.call(['pdfseparate', os.path.join(tmpdir, fname), os.path.join(tmpdir,
'pg-%d.pdf')])
else:
for page in p:
subprocess.call(['pdfseparate', '-f', page, '-l', page, os.path.join(tmpdir, fname),
os.path.join(tmpdir, 'pg-%s.pdf' % page)])
print "finished in", time.time() - start_time, "seconds"
logging.info("Time taken for " + filename + ": " +
str(time.time() - start_time) + " seconds")
glob_pdf = sorted(glob.glob(os.path.join(tmpdir, 'pg-*.pdf')))


def _utf8(cell):
    """Return `cell` as a utf-8 byte string, leaving byte strings untouched.

    lattice() already returns utf-8 encoded cells; calling .encode() on
    those again makes Python 2 decode them as ascii first, which fails on
    any non-ascii character.
    """
    return cell.encode('utf-8') if isinstance(cell, unicode) else cell


def _write_csv(path, rows):
    """Write `rows` to the csv file at `path` and report it on stdout."""
    with open(path, 'w') as outfile:
        writer = csv.writer(outfile)
        for row in rows:
            writer.writerow([_utf8(cell) for cell in row])
    print("saved as %s" % os.path.basename(path))
    print("")


def _exit_without_output(method):
    """Point at the method's help, remove the temp directory and exit."""
    print("")
    print("See 'camelot %s -h' for various parameters you can tweak." % method)
    # sys.exit() skips the cleanup at the end of the script, so do it here.
    shutil.rmtree(tmpdir)
    sys.exit()


if args['<method>'] == 'lattice':
    print("using the lattice method")
    for g in glob_pdf:
        g_fname = os.path.basename(g)
        g_froot, __ = os.path.splitext(g)
        print("converting %s to image" % g_fname)
        # Argument list instead of a shell string: paths containing spaces
        # or shell metacharacters are passed through unchanged.
        subprocess.call(['convert', '-density', '300',
                         g, '-depth', '8', g_froot + '.png'])
        try:
            data = lattice(g, f=args['--fill'], s=int(args['--scale']),
                           jtol=int(args['--jtol']), mtol=int(args['--mtol']),
                           invert=args['--invert'], debug=args['--debug'])
            if data is None:
                _exit_without_output('lattice')
            for k in sorted(data.keys()):
                _write_csv(g_froot + '_%s.csv' % k, data[k])
        except Exception:
            # SystemExit is not an Exception subclass, so the exit above
            # is not swallowed here.
            logging.exception("")
            print("couldn't parse %s see log for more info" % g_fname)
            print("")
elif args['<method>'] == 'stream':
    print("using the stream method")
    for g in glob_pdf:
        g_fname = os.path.basename(g)
        g_froot, __ = os.path.splitext(g)
        try:
            data = stream(g, ncolumns=int(args['--ncols']), columns=args['--columns'],
                          char_margin=float(args['--cmargin']),
                          line_margin=float(args['--lmargin']),
                          word_margin=float(args['--wmargin']),
                          debug=args['--debug'])
            if data is None:
                _exit_without_output('stream')
            _write_csv(g_froot + '.csv', data)
        except Exception:
            logging.exception("")
            print("couldn't parse %s see log for more info" % g_fname)
            print("")
glob_csv = sorted(glob.glob(os.path.join(tmpdir, '*.csv')), key=filesort)

# Results go to --output when given, otherwise next to the input pdf.
# os.path.dirname() yields '' for a bare filename, which shutil.copy cannot
# use as a destination, so fall back to the current directory.
outdir = args['--output'] or fdir or os.curdir
# Copying into a missing directory would create a *file* with that name.
if not os.path.isdir(outdir):
    os.makedirs(outdir)

if args['--format'] == 'csv':
    if len(glob_csv) == 1:
        shutil.copy(glob_csv[0], outdir)
    else:
        zipname = froot + '.zip'
        zippath = os.path.join(tmpdir, zipname)
        print("zipping 'em up")
        with zipfile.ZipFile(zippath, 'a', zipfile.ZIP_DEFLATED) as myzip:
            for g in glob_csv:
                myzip.write(g, os.path.join(froot, os.path.basename(g)))
        shutil.copy(zippath, outdir)
    if args['--log']:
        # The log file, not the zip: `zippath` does not even exist in the
        # single-csv case.
        shutil.copy(logfname, outdir)
    print("")
elif args['--format'] == 'xlsx':
    from pyexcel_xlsx import save_data
    from collections import OrderedDict
    # One sheet per csv, in page/table order, named after the csv file.
    data = OrderedDict()
    for c in glob_csv:
        c_fname = os.path.basename(c)
        c_froot, __ = os.path.splitext(c_fname)
        print("adding %s to excel file" % c_fname)
        with open(c, 'r') as csvfile:
            data[c_froot] = list(csv.reader(csvfile))
    xlsxname = froot + '.xlsx'
    xlsxpath = os.path.join(tmpdir, xlsxname)
    save_data(xlsxpath, data)
    shutil.copy(xlsxpath, outdir)
    if args['--log']:
        shutil.copy(logfname, outdir)
    print("")
    print("saved as %s" % xlsxname)
print "cleaning up..."
shutil.rmtree(tmpdir)
print "finished in", time.time() - start_time, "seconds"
logging.info("Time taken for " + fname + ": " +
str(time.time() - start_time) + " seconds")

58
cell.py
View File

@ -1,6 +1,44 @@
class Cell:
"""Cell
Parameters
----------
x1 : int
y1 : int
x2 : int
y2 : int
Attributes
----------
lb : tuple
lt : tuple
rb : tuple
rt : tuple
bbox : tuple
left : bool
right : bool
top : bool
bottom : bool
text : string
spanning_h : bool
spanning_v : bool
"""
def __init__(self, x1, y1, x2, y2):
self.lb = (x1, y1)
self.lt = (x1, y2)
self.rb = (x2, y1)
@ -15,10 +53,28 @@ class Cell:
self.spanning_v = False
def add_text(self, text):
self.text += text
"""Add text to cell object.
Parameters
----------
text : string
"""
self.text = ''.join([self.text, text])
def get_text(self):
"""Get text from cell object.
Returns
-------
text : string
"""
return self.text
def get_bounded_edges(self):
"""Get number of edges by which a cell is bounded.
Returns
-------
bounded_edges : int
"""
return self.top + self.bottom + self.left + self.right

View File

@ -1,37 +1,173 @@
import os
import csv
import cv2
import glob
import numpy as np
from table import Table
from pdf import get_pdf_info
from morph_transform import morph_transform
from utils import (translate, scale, merge_close_values, get_row_idx,
get_column_idx, reduce_index, outline, fill, remove_empty)
def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
char_margin, line_margin, word_margin):
def morph_transform(img, s=15, invert=False):
    """Morphological Transformation

    Applies a series of morphological operations on the image
    to find table contours and line segments.
    http://answers.opencv.org/question/63847/how-to-extract-tables-from-an-image/

    adaptiveThreshold is called with blockSize=15 and C=-0.2; see also
    http://pequan.lip6.fr/~bereziat/pima/2012/seuillage/sezgin04.pdf

    Parameters
    ----------
    img : ndarray
        Image as accepted by cv2.cvtColor with COLOR_BGR2GRAY.

    s : int, default: 15, optional
        Scaling factor. Large scaling factor leads to smaller lines
        being detected.

    invert : bool, default: False, optional
        Invert pdf image to make sure that lines are in foreground.

    Returns
    -------
    tables : dict
        Dictionary with table bounding box (x, y + h, x + w, y) as key
        and list of joints found in the table as value.

    v_segments : list
        List of vertical line segments found in the image.

    h_segments : list
        List of horizontal line segments found in the image.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # The grayscale image is thresholded as-is when `invert` is set and
    # after np.invert otherwise.
    source = gray if invert else np.invert(gray)
    threshold = cv2.adaptiveThreshold(
        source, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -0.2)

    # Kernel lengths must be integers; `//` keeps that explicit.
    verticalsize = threshold.shape[0] // s
    horizontalsize = threshold.shape[1] // s
    ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize))
    hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1))

    # Erode then dilate with a long thin kernel so that only structures at
    # least as long as the kernel survive.
    vertical = cv2.erode(threshold, ver, (-1, -1))
    vertical = cv2.dilate(vertical, ver, (-1, -1))
    horizontal = cv2.erode(threshold, hor, (-1, -1))
    horizontal = cv2.dilate(horizontal, hor, (-1, -1))

    mask = vertical + horizontal
    joints = np.bitwise_and(vertical, horizontal)

    # findContours returns (image, contours, hierarchy) on OpenCV 3 and
    # (contours, hierarchy) on 2 and 4; [-2] is the contour list on all.
    contours = cv2.findContours(
        mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    # keep the ten largest contours
    contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]

    tables = {}
    for c in contours:
        c_poly = cv2.approxPolyDP(c, 3, True)
        x, y, w, h = cv2.boundingRect(c_poly)
        # count the joints that fall inside this contour's bounding box
        roi = joints[y:y + h, x:x + w]
        jc = cv2.findContours(
            roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)[-2]
        if len(jc) <= 4:  # skip contours with 4 or fewer joints
            continue
        joint_coords = []
        for j in jc:
            jx, jy, jw, jh = cv2.boundingRect(j)
            # centre of the joint, shifted from roi to image coordinates
            c1, c2 = x + (2 * jx + jw) // 2, y + (2 * jy + jh) // 2
            joint_coords.append((c1, c2))
        tables[(x, y + h, x + w, y)] = joint_coords

    v_segments, h_segments = [], []
    vcontours = cv2.findContours(
        vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    for vc in vcontours:
        x, y, w, h = cv2.boundingRect(vc)
        x_mid = (x + x + w) // 2
        v_segments.append((x_mid, y + h, x_mid, y))

    hcontours = cv2.findContours(
        horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    for hc in hcontours:
        x, y, w, h = cv2.boundingRect(hc)
        y_mid = (y + y + h) // 2
        h_segments.append((x, y_mid, x + w, y_mid))

    return tables, v_segments, h_segments
def lattice(filepath, f=None, s=15, jtol=2, mtol=2, invert=False, debug=None):
"""Lattice algorithm
Makes table using pdf geometry information returned by
morph_transform and fills data returned by PDFMiner in table cells.
Parameters
----------
filepath : string
f : string, default: None, optional
Fill data in horizontal and/or vertical spanning
cells. ('h', 'v', 'hv')
s : int, default: 15, optional
Scaling factor. Large scaling factor leads to smaller lines
being detected.
jtol : int, default: 2, optional
Tolerance to account for when comparing joint and line
coordinates.
mtol : int, default: 2, optional
Tolerance to account for when merging lines which are
very close.
invert : bool, default: False, optional
Invert pdf image to make sure that lines are in foreground.
debug : string
Debug by visualizing pdf geometry.
('contour', 'line', 'joint', 'table')
Returns
-------
output : dict
Dictionary with table number as key and list of data as value.
"""
if debug:
import matplotlib.pyplot as plt
import matplotlib.patches as patches
filename = os.path.basename(filepath)
print "working on", filename
imagename = os.path.join(pdf_dir, filename.split('.')[0] + '.png')
fileroot, __ = os.path.splitext(filepath)
imagename = fileroot + '.png'
img = cv2.imread(imagename)
img_x, img_y = img.shape[1], img.shape[0]
text, pdf_x, pdf_y = get_pdf_info(
os.path.join(pdf_dir, filename), 'spreadsheet',
char_margin, line_margin, word_margin)
text, pdf_x, pdf_y = get_pdf_info(filepath, method='lattice')
scaling_factor_x = pdf_x / float(img_x)
scaling_factor_y = pdf_y / float(img_y)
tables, v_segments, h_segments = morph_transform(imagename, s, invert)
tables, v_segments, h_segments = morph_transform(img, s=s, invert=invert)
if debug == ["contours"]:
if debug == "contour":
for t in tables.keys():
cv2.rectangle(img, (t[0], t[1]), (t[2], t[3]), (255, 0, 0), 3)
plt.imshow(img)
if debug == ["joints"]:
plt.show()
return None
if debug == "joint":
x_coord = []
y_coord = []
for k in tables.keys():
@ -42,6 +178,8 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
plt.plot(x_coord, y_coord, 'ro')
plt.axis([0, max_x + 100, max_y + 100, 0])
plt.imshow(img)
plt.show()
return None
# detect if vertical
num_v = [t for t in text if (not t.upright) and t.get_text().strip()]
@ -80,7 +218,8 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
abs(translate(-img_y, h[3])), scaling_factor_y)
h_segments_new.append((x1, y1, x2, y2))
num_tables = 0
num_tables = 1
output = {}
# sort tables based on y-coord
for k in sorted(tables_new.keys(), key=lambda x: x[1], reverse=True):
# find rows and columns that lie in table
@ -91,19 +230,21 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
h_s = [h for h in h_segments_new if h[0] > lb[0] - 2 and h[2]
< rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2]
if debug == ["lines"]:
if debug == "line":
for v in v_s:
plt.plot([v[0], v[2]], [v[1], v[3]])
for h in h_s:
plt.plot([h[0], h[2]], [h[1], h[3]])
plt.show()
return None
columns, rows = zip(*tables_new[k])
columns, rows = list(columns), list(rows)
columns.extend([lb[0], rt[0]])
rows.extend([lb[1], rt[1]])
# sort horizontal and vertical segments
columns = merge_close_values(sorted(columns), mtol)
rows = merge_close_values(sorted(rows, reverse=True), mtol)
columns = merge_close_values(sorted(columns), mtol=mtol)
rows = merge_close_values(sorted(rows, reverse=True), mtol=mtol)
# make grid using x and y coord of shortlisted rows and columns
columns = [(columns[i], columns[i + 1])
for i in range(0, len(columns) - 1)]
@ -111,13 +252,13 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
table = Table(columns, rows)
# light up cell edges
table = table.set_edges(v_s, h_s, jtol)
table = table.set_edges(v_s, h_s, jtol=jtol)
# table set span method
table = table.set_spanning()
# TODO
# light up table border
table = outline(table)
if debug == ["tables"]:
if debug == "table":
for i in range(len(table.cells)):
for j in range(len(table.cells[i])):
if table.cells[i][j].left:
@ -132,8 +273,8 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
if table.cells[i][j].bottom:
plt.plot([table.cells[i][j].lb[0], table.cells[i][j].rb[0]],
[table.cells[i][j].lb[1], table.cells[i][j].rb[1]])
if debug:
plt.show()
return None
# fill text after sorting it
if not rotated:
@ -152,26 +293,20 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
r_idx, c_idx = reduce_index(table, rotated, r_idx, c_idx)
table.cells[r_idx][c_idx].add_text(t.get_text().strip('\n'))
if fill:
table = fill(table, fill)
if f is not None:
table = fill(table, f=f)
data = []
for i in range(len(table.cells)):
data.append([table.cells[i][j].get_text().strip().encode('utf-8')
for j in range(len(table.cells[i]))])
for j in range(len(table.cells[i]))])
if rotated == 'left':
data = zip(*data[::-1])
elif rotated == 'right':
data = zip(*data[::1])
data.reverse()
data = remove_empty(data)
csvname = filename.split(
'.')[0] + ('_table_%d' % (num_tables + 1)) + '.csv'
csvpath = os.path.join(pdf_dir, csvname)
with open(csvpath, 'w') as outfile:
writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)
for d in data:
writer.writerow(d)
print "saved as", csvname
print
output['table_%d' % num_tables] = data
num_tables += 1
return output

View File

@ -1,75 +0,0 @@
import cv2
import numpy as np
def morph_transform(imagename, s, invert):
    """Find table contours, joints and line segments in an image file.

    http://answers.opencv.org/question/63847/how-to-extract-tables-from-an-image/

    Parameters
    ----------
    imagename : string
        Path of the image to read with cv2.imread.
    s : int
        Scaling factor; the image height/width divided by `s` gives the
        length of the vertical/horizontal structuring element.
    invert : bool
        When True the grayscale image is thresholded as-is, otherwise it
        is inverted with np.invert first.

    Returns
    -------
    tables : dict
        Bounding box (x, y + h, x + w, y) -> list of joint coordinates.
    v_segments : list
    h_segments : list

    Raises
    ------
    IOError
        If the image cannot be read.
    """
    img = cv2.imread(imagename)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise IOError("could not read image: %s" % imagename)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # empirical result taken from
    # http://pequan.lip6.fr/~bereziat/pima/2012/seuillage/sezgin04.pdf
    source = gray if invert else np.invert(gray)
    threshold = cv2.adaptiveThreshold(
        source, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -0.2)

    # Kernel lengths must be integers; `//` keeps that explicit.
    verticalsize = threshold.shape[0] // s
    horizontalsize = threshold.shape[1] // s
    ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize))
    hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1))

    vertical = cv2.erode(threshold, ver, (-1, -1))
    vertical = cv2.dilate(vertical, ver, (-1, -1))
    horizontal = cv2.erode(threshold, hor, (-1, -1))
    horizontal = cv2.dilate(horizontal, hor, (-1, -1))

    mask = vertical + horizontal
    joints = np.bitwise_and(vertical, horizontal)

    # [-2] selects the contour list on OpenCV 2, 3 and 4 alike (the tuple
    # has three items on 3.x and two otherwise).
    contours = cv2.findContours(
        mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]

    tables = {}
    for c in contours:
        c_poly = cv2.approxPolyDP(c, 3, True)
        x, y, w, h = cv2.boundingRect(c_poly)
        # count the joints that fall inside this contour's bounding box
        roi = joints[y:y + h, x:x + w]
        jc = cv2.findContours(
            roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)[-2]
        if len(jc) <= 4:  # skip contours with 4 or fewer joints
            continue
        joint_coords = []
        for j in jc:
            jx, jy, jw, jh = cv2.boundingRect(j)
            c1, c2 = x + (2 * jx + jw) // 2, y + (2 * jy + jh) // 2
            joint_coords.append((c1, c2))
        tables[(x, y + h, x + w, y)] = joint_coords

    v_segments, h_segments = [], []
    vcontours = cv2.findContours(
        vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    for vc in vcontours:
        x, y, w, h = cv2.boundingRect(vc)
        x_mid = (x + x + w) // 2
        v_segments.append((x_mid, y + h, x_mid, y))

    hcontours = cv2.findContours(
        horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    for hc in hcontours:
        x, y, w, h = cv2.boundingRect(hc)
        y_mid = (y + y + h) // 2
        h_segments.append((x, y_mid, x + w, y_mid))

    return tables, v_segments, h_segments

73
pdf.py
View File

@ -9,35 +9,86 @@ from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal
def parse_text_basic(layout, t=None):
def parse_text_stream(layout, t=None):
"""Recursively parse pdf layout to get a list of
LTTextHorizontal objects.
Parameters
----------
layout : object
t : list
Returns
-------
t : list
"""
if t is None:
t = []
try:
for obj in layout._objs:
if type(obj) is LTTextLineHorizontal:
if isinstance(obj, LTTextLineHorizontal):
t.append(obj)
else:
t += parse_text_basic(obj)
t += parse_text_stream(obj)
except AttributeError:
pass
return t
def parse_text_spreadsheet(layout, t=None):
def parse_text_lattice(layout, t=None):
"""Recursively parse pdf layout to get a list of
LTChar objects.
Parameters
----------
layout : object
t : list
Returns
-------
t : list
"""
if t is None:
t = []
try:
for obj in layout._objs:
if type(obj) is LTChar:
if isinstance(obj, LTChar):
t.append(obj)
else:
t += parse_text_spreadsheet(obj)
t += parse_text_lattice(obj)
except AttributeError:
pass
return t
def get_pdf_info(pdfname, method, char_margin, line_margin, word_margin):
def get_pdf_info(pdfname, method=None, char_margin=2.0, line_margin=0.5,
word_margin=0.1):
"""Get list of text objects along with pdf width and height.
Parameters
----------
pdfname : string
method : string
char_margin : float
line_margin : float
word_margin : float
Returns
-------
text : list
pdf_x : int
pdf_y : int
"""
if not method:
return None
with open(pdfname, 'r') as f:
parser = PDFParser(f)
document = PDFDocument(parser)
@ -52,9 +103,9 @@ def get_pdf_info(pdfname, method, char_margin, line_margin, word_margin):
for page in PDFPage.create_pages(document):
interpreter.process_page(page)
layout = device.get_result()
if method == 'basic':
text = parse_text_basic(layout)
elif method == 'spreadsheet':
text = parse_text_spreadsheet(layout)
if method == 'stream':
text = parse_text_stream(layout)
elif method == 'lattice':
text = parse_text_lattice(layout)
pdf_x, pdf_y = layout.bbox[2], layout.bbox[3]
return text, pdf_x, pdf_y

143
stream.py 100644
View File

@ -0,0 +1,143 @@
import os
import numpy as np
from pdf import get_pdf_info
def overlap(l):
    """Groups overlapping columns and returns list with updated
    columns boundaries.

    Parameters
    ----------
    l : list
        List of (x0, x1) column boundaries, in any order.

    Returns
    -------
    merged : list
        List of merged (x0, x1) column boundaries in ascending order.
    """
    merged = []
    # Sorting here makes the merge independent of the caller's ordering;
    # it is a no-op for input that is already sorted.
    for higher in sorted(l):
        if merged and higher[0] <= merged[-1][1]:
            lower = merged[-1]
            # `higher` begins before `lower` ends: widen `lower` in place.
            merged[-1] = (lower[0], max(lower[1], higher[1]))
        else:
            merged.append(higher)
    return merged
def stream(filepath, ncolumns=0, columns=None, char_margin=2.0,
           line_margin=0.5, word_margin=0.1, debug=False):
    """Stream algorithm

    Groups data returned by PDFMiner into rows and finds mode of the
    number of elements in each row to guess number of columns.

    Parameters
    ----------
    filepath : string
    ncolumns : int, default: 0, optional
        Number of columns.
    columns : string or list, default: None, optional
        Column x-coordinates, either as a comma-separated string or as
        a list of values convertible to float.
    char_margin : float, default: 2.0, optional
        Char margin. Chars closer than char_margin are grouped together
        to form a word.
    line_margin : float, default: 0.5, optional
        Line margin. Lines closer than line_margin are grouped together
        to form a textbox.
    word_margin : float, default: 0.1, optional
        Word margin. Insert blank spaces between chars if distance
        between words is greater than word margin.
    debug : bool, default: False, optional
        Debug by visualizing textboxes.

    Returns
    -------
    output : list
        2-D list of strings, one inner list per detected row and one
        entry per detected column. None is returned in debug mode and
        when no column could be determined for the page.
    """
    filename = os.path.basename(filepath)
    print("working on " + filename)
    text, __, __ = get_pdf_info(
        filepath, method='stream', char_margin=char_margin,
        line_margin=line_margin, word_margin=word_margin)
    # top-to-bottom, then left-to-right
    text.sort(key=lambda x: (-x.y0, x.x0))

    # Group text objects whose y0 lies within 2 units of the current
    # row's y0. Only non-empty rows are stored, and the row still being
    # built when the loop ends is flushed so the last row is not lost.
    data = []
    current = []
    y_last = None
    for t in text:
        # is checking for upright necessary?
        if not t.get_text().strip():
            continue
        if y_last is None or not np.isclose(y_last, t.y0, atol=2):
            if current:
                data.append(current)
            current = []
            y_last = t.y0
        current.append(t)
    if current:
        data.append(current)
    elements = [len(d) for d in data]

    if debug:
        import matplotlib.pyplot as plt
        import matplotlib.patches as patches

        fig = plt.figure()
        ax = fig.add_subplot(111, aspect='equal')
        xs, ys = [], []
        for d in data:
            for t in d:
                xs.extend([t.x0, t.x1])
                ys.extend([t.y0, t.y1])
                ax.add_patch(
                    patches.Rectangle(
                        (t.x0, t.y0),
                        t.x1 - t.x0,
                        t.y1 - t.y0
                    )
                )
        if xs:  # min()/max() raise on an empty page
            ax.set_xlim(min(xs) - 10, max(xs) + 10)
            ax.set_ylim(min(ys) - 10, max(ys) + 10)
        plt.show()
        return None

    if columns:
        # accept the documented comma-separated string as well as a list
        if hasattr(columns, 'split'):
            columns = columns.split(',')
        cols = [(float(columns[i]), float(columns[i + 1]))
                for i in range(len(columns) - 1)]
    else:
        # a table can't have just 1 column, can it?
        counts = [n for n in elements if n != 1]
        if ncolumns:
            mode = ncolumns
        elif counts:
            mode = max(set(counts), key=counts.count)
        else:
            print("couldn't find a table on this page")
            return None
        cols = [(t.x0, t.x1) for d in data for t in d if len(d) == mode]
        cols = overlap(sorted(cols))
    # each column is represented by the midpoint of its boundaries
    cols = [(c[0] + c[1]) / 2.0 for c in cols]

    if data and not cols:
        print("couldn't find a table on this page")
        return None

    output = [['' for c in cols] for d in data]
    for row, d in enumerate(data):
        for t in d:
            # assign the text to the column whose midpoint is closest
            # to the text's own horizontal midpoint
            cog = (t.x0 + t.x1) / 2.0
            idx = min(range(len(cols)), key=lambda i: abs(cog - cols[i]))
            content = t.get_text().strip()
            if output[row][idx]:
                output[row][idx] += ' ' + content
            else:
                output[row][idx] = content
    return output

View File

@ -4,14 +4,55 @@ from cell import Cell
class Table:
"""Table
Parameters
----------
columns : list
List of column x-coordinates.
rows : list
List of row y-coordinates.
Attributes
----------
cells : list
2-D list of cell objects.
columns : list
List of column x-coordinates.
rows : list
List of row y-coordinates.
"""
def __init__(self, columns, rows):
self.cells = [[Cell(c[0], r[1], c[1], r[0])
for c in columns] for r in rows]
self.columns = columns
self.rows = rows
def set_edges(self, vertical, horizontal, jtol):
def set_edges(self, vertical, horizontal, jtol=2):
"""Set cell edges to True if corresponding line segments
are detected in the pdf image.
Parameters
----------
vertical : list
List of vertical line segments.
horizontal : list
List of horizontal line segments.
jtol : int, default: 2, optional
Tolerance to account for when comparing joint and line
coordinates.
Returns
-------
self : object
Returns self.
"""
for v in vertical:
# find closest x coord
# iterate over y coords and find closest points
@ -117,6 +158,14 @@ class Table:
return self
def set_spanning(self):
"""Set spanning values of a cell to True if it isn't
bounded by four edges.
Returns
-------
self : object
Returns self.
"""
for i in range(len(self.cells)):
for j in range(len(self.cells[i])):
bound = self.cells[i][j].get_bounded_edges()
@ -125,28 +174,38 @@ class Table:
elif bound == 3:
if not self.cells[i][j].left:
if self.cells[i][j].right and self.cells[i][j].top and self.cells[i][j].bottom:
if (self.cells[i][j].right and
self.cells[i][j].top and
self.cells[i][j].bottom):
self.cells[i][j].spanning_h = True
elif not self.cells[i][j].right:
if self.cells[i][j].left and self.cells[i][j].top and self.cells[i][j].bottom:
if (self.cells[i][j].left and
self.cells[i][j].top and
self.cells[i][j].bottom):
self.cells[i][j].spanning_h = True
elif not self.cells[i][j].top:
if self.cells[i][j].left and self.cells[i][j].right and self.cells[i][j].bottom:
if (self.cells[i][j].left and
self.cells[i][j].right and
self.cells[i][j].bottom):
self.cells[i][j].spanning_v = True
elif not self.cells[i][j].bottom:
if self.cells[i][j].left and self.cells[i][j].right and self.cells[i][j].top:
if (self.cells[i][j].left and
self.cells[i][j].right and
self.cells[i][j].top):
self.cells[i][j].spanning_v = True
elif bound == 2:
if self.cells[i][j].left and self.cells[i][j].right:
if not self.cells[i][j].top and not self.cells[i][j].bottom:
if (not self.cells[i][j].top and
not self.cells[i][j].bottom):
self.cells[i][j].spanning_v = True
elif self.cells[i][j].top and self.cells[i][j].bottom:
if not self.cells[i][j].left and not self.cells[i][j].right:
if (not self.cells[i][j].left and
not self.cells[i][j].right):
self.cells[i][j].spanning_h = True
return self

154
utils.py
View File

@ -2,16 +2,61 @@ import numpy as np
def translate(x1, x2):
    """Translate coordinate x2 by x1.

    Parameters
    ----------
    x1 : float
        Offset to add.
    x2 : float
        Coordinate to translate.

    Returns
    -------
    x2 : float
        The translated coordinate, ``x2 + x1``.
    """
    # Plain addition instead of ``+=`` so that a mutable argument
    # (e.g. a numpy array) passed by the caller is never modified.
    return x2 + x1
def scale(x, s):
    """Scale coordinate x by scaling factor s.

    Parameters
    ----------
    x : float
        Coordinate to scale.
    s : float
        Scaling factor.

    Returns
    -------
    x : float
        The scaled coordinate, ``x * s``.
    """
    # Plain multiplication instead of ``*=`` so that a mutable argument
    # (e.g. a numpy array) passed by the caller is never modified.
    return x * s
def rotate(x1, y1, x2, y2, angle):
"""Rotate point x2, y2 about point x1, y1 by angle.
Parameters
----------
x1 : float
y1 : float
x2 : float
y2 : float
angle : float
Angle in radians.
Returns
-------
xnew : float
ynew : float
"""
s = np.sin(angle)
c = np.cos(angle)
x2 = translate(-x1, x2)
@ -23,7 +68,20 @@ def rotate(x1, y1, x2, y2, angle):
return xnew, ynew
def remove_close_values(ar, mtol):
def remove_close_values(ar, mtol=2):
"""Remove values which are within a tolerance of mtol of another value
present in list.
Parameters
----------
ar : list
mtol : int, default: 2, optional
Returns
-------
ret : list
"""
ret = []
for a in ar:
if not ret:
@ -37,7 +95,20 @@ def remove_close_values(ar, mtol):
return ret
def merge_close_values(ar, mtol):
def merge_close_values(ar, mtol=2):
"""Merge values which are within a tolerance of mtol by calculating
a moving mean.
Parameters
----------
ar : list
mtol : int, default: 2, optional
Returns
-------
ret : list
"""
ret = []
for a in ar:
if not ret:
@ -53,18 +124,63 @@ def merge_close_values(ar, mtol):
def get_row_idx(t, rows):
    """Get index of the row in which the given object falls by
    comparing their co-ordinates.

    Parameters
    ----------
    t : object
        Object exposing ``y0`` and ``y1`` attributes.
    rows : list
        List of rows; each row is indexed as (upper y, lower y).

    Returns
    -------
    r : int
        Index of the first row strictly containing the vertical
        midpoint of ``t``, or None when no row contains it (a midpoint
        lying exactly on a row boundary matches no row).
    """
    y_mid = (t.y0 + t.y1) / 2.0
    for r, row in enumerate(rows):
        if row[1] < y_mid < row[0]:
            return r
    return None
def get_column_idx(t, columns):
    """Get index of the column in which the given object falls by
    comparing their co-ordinates.

    Parameters
    ----------
    t : object
        Object exposing ``x0`` and ``x1`` attributes.
    columns : list
        List of columns; each column is indexed as (left x, right x).

    Returns
    -------
    c : int
        Index of the first column strictly containing the horizontal
        midpoint of ``t``, or None when no column contains it (a
        midpoint lying exactly on a column boundary matches no column).
    """
    x_mid = (t.x0 + t.x1) / 2.0
    for c, column in enumerate(columns):
        if column[0] < x_mid < column[1]:
            return c
    return None
def reduce_index(t, rotated, r_idx, c_idx):
"""Shift a text object if it lies within a spanning cell taking
in account table rotation.
Parameters
----------
t : object
rotated : string
r_idx : int
c_idx : int
Returns
-------
r_idx : int
c_idx : int
"""
if not rotated:
if t.cells[r_idx][c_idx].spanning_h:
while not t.cells[r_idx][c_idx].left:
@ -90,6 +206,16 @@ def reduce_index(t, rotated, r_idx, c_idx):
def outline(t):
"""Light up table boundary.
Parameters
----------
t : object
Returns
-------
t : object
"""
for i in range(len(t.cells)):
t.cells[i][0].left = True
t.cells[i][len(t.cells[i]) - 1].right = True
@ -99,7 +225,19 @@ def outline(t):
return t
def fill(t, f):
def fill(t, f=None):
"""Fill spanning cells.
Parameters
----------
t : object
f : string, default: None, optional
Returns
-------
t : object
"""
if f == "h":
for i in range(len(t.cells)):
for j in range(len(t.cells[i])):
@ -124,6 +262,16 @@ def fill(t, f):
def remove_empty(d):
"""Remove empty rows and columns.
Parameters
----------
d : list
Returns
-------
d : list
"""
for i, row in enumerate(d):
if row == [''] * len(row):
d.pop(i)