Modify command line tool
Precompute globs; replace argparse with docopt; fix CLI; update .gitignore; add docstrings; update README; fix typo; replace zip subprocess call; use tempfile; fix newline (pull/2/head)
parent
3045a92630
commit
271d4cafd6
|
|
@ -1,3 +1,5 @@
|
|||
__pycache__/
|
||||
*.py[cod]
|
||||
*.so
|
||||
|
||||
.camelot/
|
||||
69
README.md
69
README.md
|
|
@ -14,63 +14,24 @@ camelot also uses poppler-utils, more specifically `pdfseparate` to separate a p
|
|||
|
||||
## Usage
|
||||
|
||||
python2 camelot.py [options] file
|
||||
<pre>
|
||||
camelot.py [options] <method> [<args>...]
|
||||
|
||||
positional arguments:
|
||||
options:
|
||||
-h, --help Show this screen.
|
||||
-v, --version Show version.
|
||||
-p, --pages <pageno> Comma-separated list of page numbers.
|
||||
Example: -p 1,3-6,10 [default: 1]
|
||||
-f, --format <format> Output format. (csv,xlsx) [default: csv]
|
||||
-l, --log Print log to file.
|
||||
-o, --output <directory> Output directory.
|
||||
|
||||
file
|
||||
camelot methods:
|
||||
lattice Looks for lines between data.
|
||||
stream Looks for spaces between data.
|
||||
|
||||
optional arguments:
|
||||
|
||||
-h, --help
|
||||
|
||||
show this help message and exit
|
||||
|
||||
-p, --pages PAGES [PAGES ...]
|
||||
|
||||
Specify the page numbers and/or page ranges to be
|
||||
parsed. Example: -p="1 3-5 9", -p="all" (default: 1)
|
||||
|
||||
-f, --format FORMAT
|
||||
|
||||
Output format (csv/xlsx). Example: -f="xlsx" (default: csv)
|
||||
|
||||
-m, --spreadsheet
|
||||
|
||||
Extract tables with ruling lines. (default: False)
|
||||
|
||||
-F, --fill FILL
|
||||
|
||||
Fill the values in empty cells horizontally(h) and/or
|
||||
vertically(v). Example: -F="h", -F="v", -F="hv" (default: None)
|
||||
|
||||
-s, --scale [SCALE]
|
||||
|
||||
Scaling factor. Large scaling factor leads to smaller
|
||||
lines being detected. (default: 15)
|
||||
|
||||
-j, --jtol [JTOL]
|
||||
|
||||
Tolerance to account for when comparing joint and line
|
||||
coordinates. (default: 2)
|
||||
|
||||
-M, --mtol [MTOL]
|
||||
|
||||
Tolerance to account for when merging lines which are
|
||||
very close. (default: 2)
|
||||
|
||||
-i, --invert
|
||||
|
||||
Make sure lines are in foreground. (default: False)
|
||||
|
||||
-d, --debug DEBUG
|
||||
|
||||
Debug by visualizing contours, lines, joints, tables.
|
||||
Example: --debug="contours"
|
||||
|
||||
-o, --output OUTPUT
|
||||
|
||||
Specify output directory.
|
||||
See 'camelot <method> -h' for more information on a specific method.
|
||||
</pre>
|
||||
|
||||
## Development
|
||||
|
||||
|
|
|
|||
80
basic.py
80
basic.py
|
|
@ -1,80 +0,0 @@
|
|||
import os
|
||||
import csv
|
||||
import numpy as np
|
||||
|
||||
from pdf import get_pdf_info
|
||||
|
||||
|
||||
def overlap(l):
    """Merge overlapping intervals.

    Intervals are processed in order of their start value, so the input
    does not have to be sorted by the caller. Two intervals are merged
    when the start of the later one is less than or equal to the end of
    the earlier one (touching intervals are merged).

    Parameters
    ----------
    l : list
        List of (start, end) pairs.

    Returns
    -------
    merged : list
        Intervals that did not need merging are returned as passed in;
        merged intervals are returned as (start, end) tuples.
    """
    merged = []
    # sorted() is stable, so an already sorted input is visited in the
    # same order as before.
    for higher in sorted(l, key=lambda interval: interval[0]):
        if merged and higher[0] <= merged[-1][1]:
            lower = merged[-1]
            merged[-1] = (min(lower[0], higher[0]),
                          max(lower[1], higher[1]))
        else:
            merged.append(higher)
    return merged
|
||||
|
||||
|
||||
def get_row_idx(t, rows):
    """Get index of the row in which a text object lies.

    Parameters
    ----------
    t : object
        Object with y0 and y1 attributes.

    rows : list
        List of row bounds; for each row, item 0 is compared against
        t.y1 and item 1 against t.y0.

    Returns
    -------
    r : int
        Index of the first row for which t.y1 <= row[0] and
        t.y0 >= row[1], or None if there is no such row.
    """
    for r, row in enumerate(rows):
        if t.y1 <= row[0] and t.y0 >= row[1]:
            return r
    return None
|
||||
|
||||
|
||||
def get_column_idx(t, columns):
    """Get index of the column in which a text object lies.

    Parameters
    ----------
    t : object
        Object with x0 and x1 attributes.

    columns : list
        List of column bounds; for each column, item 0 is compared
        against t.x0 and item 1 against t.x1.

    Returns
    -------
    c : int
        Index of the first column for which t.x0 >= column[0] and
        t.x1 <= column[1], or None if there is no such column.
    """
    for c, column in enumerate(columns):
        if t.x0 >= column[0] and t.x1 <= column[1]:
            return c
    return None
|
||||
|
||||
|
||||
def basic(pdf_dir, filename, char_margin, line_margin, word_margin):
    """Basic method

    Groups the text objects returned by get_pdf_info into rows using
    their y0 coordinate, derives column positions from the rows that
    have the most common number of text objects, assigns every text
    object to the column whose centre is nearest to its own, and writes
    the result as a fully quoted csv file inside pdf_dir.

    Parameters
    ----------
    pdf_dir : string
        Directory that contains the pdf; the csv is written here too.

    filename : string
        Name of the pdf file inside pdf_dir.

    char_margin : float

    line_margin : float

    word_margin : float
        The three margins are passed to get_pdf_info unchanged.
    """
    # Single-argument form prints the same line on Python 2 and 3.
    print("working on %s" % filename)
    text, _, _ = get_pdf_info(os.path.join(pdf_dir, filename), 'basic',
                              char_margin, line_margin, word_margin)
    text.sort(key=lambda x: (-x.y0, x.x0))

    # Group non-empty text objects into rows; a new row starts whenever
    # y0 moves by more than the tolerance.
    data = []
    temp = []
    y_last = None
    for t in text:
        if not t.get_text().strip():
            continue
        if y_last is None or not np.isclose(y_last, t.y0, atol=2):
            # Only flush rows that hold something, so no empty leading
            # row ends up in the output.
            if temp:
                data.append(temp)
            temp = []
            y_last = t.y0
        temp.append(t)
    # The loop only flushes a row when the next one starts, so the final
    # row has to be flushed here or it would be lost.
    if temp:
        data.append(temp)

    columns = []
    if data:
        sizes = [len(d) for d in data]
        # a table can't have just 1 column, can it?
        elements = [n for n in sizes if n != 1]
        if not elements:
            # Every row has a single text object; use those rows instead
            # of failing on an empty sequence.
            elements = sizes
        mode = max(set(elements), key=elements.count)
        columns = [(t.x0, t.x1) for d in data for t in d if len(d) == mode]
        columns = overlap(sorted(columns))
        columns = [(c[0] + c[1]) / 2.0 for c in columns]

    output = [['' for c in columns] for d in data]
    for row, d in enumerate(data):
        for t in d:
            # centre of the text object along x
            cog = (t.x0 + t.x1) / 2.0
            idx = min(range(len(columns)),
                      key=lambda i: abs(cog - columns[i]))
            cell_text = t.get_text().strip()
            if output[row][idx]:
                output[row][idx] += ' ' + cell_text
            else:
                output[row][idx] = cell_text

    # splitext keeps dots that are part of the file's base name.
    csvname = os.path.splitext(filename)[0] + '.csv'
    csvpath = os.path.join(pdf_dir, csvname)
    with open(csvpath, 'w') as outfile:
        writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)
        for row in output:
            # NOTE(review): presumably get_text() returns unicode on
            # Python 2, hence the explicit encode -- confirm.
            writer.writerow([cell.encode('utf-8') for cell in row])
|
||||
|
|
@ -1,73 +1,114 @@
|
|||
#!/usr/bin/env python2
|
||||
import os
|
||||
import re
|
||||
import csv
|
||||
import sys
|
||||
import glob
|
||||
import time
|
||||
import shutil
|
||||
import logging
|
||||
import zipfile
|
||||
import tempfile
|
||||
import subprocess
|
||||
import argparse
|
||||
from docopt import docopt
|
||||
from werkzeug.utils import secure_filename
|
||||
|
||||
from basic import basic
|
||||
from spreadsheet import spreadsheet
|
||||
from lattice import lattice
|
||||
from stream import stream
|
||||
|
||||
|
||||
doc = """
|
||||
camelot parses tables from PDFs!
|
||||
|
||||
usage:
|
||||
camelot.py [options] <method> [<args>...]
|
||||
|
||||
options:
|
||||
-h, --help Show this screen.
|
||||
-v, --version Show version.
|
||||
-p, --pages <pageno> Comma-separated list of page numbers.
|
||||
Example: -p 1,3-6,10 [default: 1]
|
||||
-f, --format <format> Output format. (csv,xlsx) [default: csv]
|
||||
-l, --log Print log to file.
|
||||
-o, --output <directory> Output directory.
|
||||
|
||||
camelot methods:
|
||||
lattice Looks for lines between data.
|
||||
stream Looks for spaces between data.
|
||||
|
||||
See 'camelot <method> -h' for more information on a specific method.
|
||||
"""
|
||||
|
||||
lattice_doc = """
|
||||
Lattice method looks for lines between data to form a table.
|
||||
|
||||
usage:
|
||||
camelot.py lattice [options] [--] <file>
|
||||
|
||||
options:
|
||||
-F, --fill <fill> Fill data in horizontal and/or vertical spanning
|
||||
cells. Example: -F h, -F v, -F hv
|
||||
-s, --scale <scale> Scaling factor. Large scaling factor leads to
|
||||
smaller lines being detected. [default: 15]
|
||||
-j, --jtol <jtol> Tolerance to account for when comparing joint
|
||||
and line coordinates. [default: 2]
|
||||
-m, --mtol <mtol> Tolerance to account for when merging lines
|
||||
which are very close. [default: 2]
|
||||
-i, --invert Invert pdf image to make sure that lines are
|
||||
in foreground.
|
||||
-d, --debug <debug> Debug by visualizing pdf geometry.
|
||||
(contour,line,joint,table) Example: -d table
|
||||
"""
|
||||
|
||||
stream_doc = """
|
||||
Stream method looks for spaces between data to form a table.
|
||||
|
||||
usage:
|
||||
camelot.py stream [options] [--] <file>
|
||||
|
||||
options:
|
||||
-n, --ncols <ncols> Number of columns. [default: 0]
|
||||
-c, --columns <columns> Comma-separated list of column x-coordinates.
|
||||
Example: -c 10.1,20.2,30.3
|
||||
-M, --cmargin <cmargin> Char margin. Chars closer than cmargin are
|
||||
grouped together to form a word. [default: 2.0]
|
||||
-L, --lmargin <lmargin> Line margin. Lines closer than lmargin are
|
||||
grouped together to form a textbox. [default: 0.5]
|
||||
-W, --wmargin <wmargin> Word margin. Insert blank spaces between chars
|
||||
if distance between words is greater than word
|
||||
margin. [default: 0.1]
|
||||
-d, --debug Debug by visualizing textboxes.
|
||||
"""
|
||||
|
||||
pno = re.compile(r'\d+')
|
||||
|
||||
|
||||
def mkdir(directory):
    """Create a directory, including parents, if it does not exist.

    Parameters
    ----------
    directory : string
        Path of the directory to create.

    Raises
    ------
    OSError
        If the directory cannot be created, or if the path exists but
        is not a directory.
    """
    import errno

    # Try first and inspect the error afterwards: checking with isdir()
    # before creating leaves a window in which another process can
    # create the directory and make makedirs() fail.
    try:
        os.makedirs(directory)
    except OSError as exc:
        if exc.errno != errno.EEXIST or not os.path.isdir(directory):
            raise
|
||||
|
||||
|
||||
def filesort(filepath):
    """Sort key for files based on the numbers in their base name.

    Parameters
    ----------
    filepath : string
        Path of a file; only its base name is examined.

    Returns
    -------
    key : tuple
        (first, second) when the base name contains exactly two
        numbers, (first, 0) when it contains any other non-zero count
        of numbers, and (0, 0) when it contains no number at all.
    """
    filename = os.path.basename(filepath)
    num = pno.findall(filename)
    if not num:
        # Names without digits would otherwise fail on num[0].
        return (0, 0)
    if len(num) == 2:
        return (int(num[0]), int(num[1]))
    return (int(num[0]), 0)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
start_time = time.time()
|
||||
CAMELOT_DIR = '.camelot/'
|
||||
mkdir(CAMELOT_DIR)
|
||||
tmpdir = tempfile.mkdtemp()
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Parse tables from pdfs!', usage='python2 camelot.py [options] file')
|
||||
parser.add_argument('-p', '--pages', nargs='+', action='store', dest='pages',
|
||||
help='Specify the page numbers and/or page ranges to be parsed. Example: -p="1 3-5 9", -p="all" (default: 1)')
|
||||
parser.add_argument('-f', '--format', nargs=1, action='store', dest='format',
|
||||
help='Output format (csv/xlsx). Example: -f="xlsx" (default: csv)', default=["csv"])
|
||||
parser.add_argument('-s', '--spreadsheet', action='store_true', dest='spreadsheet',
|
||||
help='Extract tables with ruling lines. (default: False)')
|
||||
parser.add_argument('-i', '--fill', action='store', dest='fill',
|
||||
help='Fill the values in empty cells horizontally(h) and/or vertically(v). Example: -F="h", -F="v", -F="hv" (default: None)', default=None)
|
||||
parser.add_argument('-c', '--scale', nargs='?', action='store', dest='scale',
|
||||
help='Scaling factor. Large scaling factor leads to smaller lines being detected. (default: 15)', default=15, type=int)
|
||||
parser.add_argument('-j', '--jtol', nargs='?', action='store',
|
||||
dest='jtol', help='Tolerance to account for when comparing joint and line coordinates. (default: 2)', default=2, type=int)
|
||||
parser.add_argument('-t', '--mtol', nargs='?', action='store',
|
||||
dest='mtol', help='Tolerance to account for when merging lines which are very close. (default: 2)', default=2, type=int)
|
||||
parser.add_argument('-n', '--invert', action='store_true', dest='invert',
|
||||
help='Make sure lines are in foreground. (default: False)')
|
||||
parser.add_argument('-d', '--debug', nargs=1, action='store', dest='debug',
|
||||
help='Debug by visualizing contours, lines, joints, tables. Example: --debug="contours"')
|
||||
parser.add_argument('-M', '--char-margin', nargs='?', action='store', dest='char_margin',
|
||||
help='(default: 2.0)', default=2.0, type=float)
|
||||
parser.add_argument('-L', '--line-margin', nargs='?', action='store', dest='line_margin',
|
||||
help='(default: 0.5)', default=0.5, type=float)
|
||||
parser.add_argument('-W', '--word-margin', nargs='?', action='store', dest='word_margin',
|
||||
help='(default: 0.1)', default=0.1, type=float)
|
||||
parser.add_argument('-o', '--output', nargs=1, action='store', dest='output',
|
||||
help='Specify output directory.')
|
||||
parser.add_argument('file', nargs=1)
|
||||
args = docopt(doc, version='0.1', options_first=True)
|
||||
argv = [args['<method>']] + args['<args>']
|
||||
if args['<method>'] == 'lattice':
|
||||
args.update(docopt(lattice_doc, argv=argv))
|
||||
elif args['<method>'] == 'stream':
|
||||
args.update(docopt(stream_doc, argv=argv))
|
||||
|
||||
result = parser.parse_args()
|
||||
|
||||
if result.pages:
|
||||
if result.pages == ['all']:
|
||||
p = result.pages
|
||||
if args['--pages']:
|
||||
if args['--pages'] == ['all']:
|
||||
p = args['--pages']
|
||||
else:
|
||||
p = []
|
||||
for r in result.pages[0].split(' '):
|
||||
for r in args['--pages'].split(','):
|
||||
if '-' in r:
|
||||
a, b = r.split('-')
|
||||
a, b = int(a), int(b)
|
||||
|
|
@ -78,59 +119,140 @@ else:
|
|||
p = ['1']
|
||||
p = sorted(set(p))
|
||||
|
||||
filename = result.file[0].split('/')[-1]
|
||||
# pdf_dir = os.path.join(CAMELOT_DIR, os.urandom(16).encode('hex'))
|
||||
pdf_dir = os.path.join(CAMELOT_DIR, filename.split('.')[0])
|
||||
mkdir(pdf_dir)
|
||||
logging.basicConfig(filename=os.path.join(pdf_dir, filename.split('.')[
|
||||
0] + '.log'), filemode='w', level=logging.DEBUG)
|
||||
fname = os.path.basename(args['<file>'])
|
||||
fname = secure_filename(fname)
|
||||
fdir = os.path.dirname(args['<file>'])
|
||||
froot, fext = os.path.splitext(fname)
|
||||
if fext.lower() != '.pdf':
|
||||
print "camelot can parse only pdfs right now"
|
||||
sys.exit()
|
||||
|
||||
shutil.copy(result.file[0], os.path.join(pdf_dir, filename))
|
||||
logfname = os.path.join(tmpdir, froot + '.log')
|
||||
logging.basicConfig(filename=logfname, filemode='w', level=logging.DEBUG)
|
||||
|
||||
shutil.copy(args['<file>'], os.path.join(tmpdir, fname))
|
||||
print "separating pdf into pages"
|
||||
print
|
||||
if p == ['all']:
|
||||
subprocess.call(['pdfseparate', os.path.join(
|
||||
pdf_dir, filename), os.path.join(pdf_dir, 'pg-%d.pdf')])
|
||||
subprocess.call(['pdfseparate', os.path.join(tmpdir, fname), os.path.join(tmpdir,
|
||||
'pg-%d.pdf')])
|
||||
else:
|
||||
for page in p:
|
||||
subprocess.call(['pdfseparate', '-f', page, '-l', page, os.path.join(
|
||||
pdf_dir, filename), os.path.join(pdf_dir, 'pg-' + page + '.pdf')])
|
||||
subprocess.call(['pdfseparate', '-f', page, '-l', page, os.path.join(tmpdir, fname),
|
||||
os.path.join(tmpdir, 'pg-%s.pdf' % page)])
|
||||
|
||||
if result.spreadsheet:
|
||||
print "using the spreadsheet method"
|
||||
for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))):
|
||||
print "converting", g.split('/')[-1], "to image"
|
||||
glob_pdf = sorted(glob.glob(os.path.join(tmpdir, 'pg-*.pdf')))
|
||||
if args['<method>'] == 'lattice':
|
||||
print "using the lattice method"
|
||||
for g in glob_pdf:
|
||||
g_fname = os.path.basename(g)
|
||||
g_froot, __ = os.path.splitext(g)
|
||||
print "converting %s to image" % g_fname
|
||||
os.system(' '.join(['convert', '-density', '300',
|
||||
g, '-depth', '8', g[:-4] + '.png']))
|
||||
g, '-depth', '8', g_froot + '.png']))
|
||||
try:
|
||||
spreadsheet(pdf_dir, g.split('/')[-1], result.fill, result.scale,
|
||||
result.jtol, result.mtol, result.invert, result.debug,
|
||||
result.char_margin, result.line_margin, result.word_margin)
|
||||
except:
|
||||
logging.error("Couldn't parse " + g.split('/')[-1])
|
||||
print "Couldn't parse", g.split('/')[-1]
|
||||
else:
|
||||
print "using the basic method"
|
||||
for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))):
|
||||
basic(pdf_dir, g.split('/')[-1], result.char_margin, result.line_margin, result.word_margin)
|
||||
data = lattice(g, f=args['--fill'], s=int(args['--scale']),
|
||||
jtol=int(args['--jtol']), mtol=int(args['--mtol']),
|
||||
invert=args['--invert'], debug=args['--debug'])
|
||||
if data is None:
|
||||
print
|
||||
print "See 'camelot lattice -h' for various parameters you can tweak."
|
||||
sys.exit()
|
||||
for k in sorted(data.keys()):
|
||||
csvfile = g_froot + '_%s.csv' % k
|
||||
with open(csvfile, 'w') as outfile:
|
||||
writer = csv.writer(outfile)
|
||||
for d in data[k]:
|
||||
writer.writerow([c.encode('utf-8') for c in d])
|
||||
print "saved as", os.path.basename(csvfile)
|
||||
print
|
||||
except Exception:
|
||||
logging.exception("")
|
||||
print "couldn't parse", g_fname, "see log for more info"
|
||||
print
|
||||
elif args['<method>'] == 'stream':
|
||||
print "using the stream method"
|
||||
for g in glob_pdf:
|
||||
g_fname = os.path.basename(g)
|
||||
g_froot, __ = os.path.splitext(g)
|
||||
try:
|
||||
data = stream(g, ncolumns=int(args['--ncols']), columns=args['--columns'],
|
||||
char_margin=float(args['--cmargin']),
|
||||
line_margin=float(args['--lmargin']),
|
||||
word_margin=float(args['--wmargin']),
|
||||
debug=args['--debug'])
|
||||
if data is None:
|
||||
print
|
||||
print "See 'camelot stream -h' for various parameters you can tweak."
|
||||
sys.exit()
|
||||
csvfile = g_froot + '.csv'
|
||||
with open(csvfile, 'w') as outfile:
|
||||
writer = csv.writer(outfile)
|
||||
for d in data:
|
||||
writer.writerow([c.encode('utf-8') for c in d])
|
||||
print "saved as", os.path.basename(csvfile)
|
||||
print
|
||||
except Exception:
|
||||
logging.exception("")
|
||||
print "couldn't parse", g_fname, "see log for more info"
|
||||
print
|
||||
|
||||
if result.format == ['xlsx']:
|
||||
import csv
|
||||
glob_csv = sorted(glob.glob(os.path.join(tmpdir, '*.csv')), key=filesort)
# Results go to --output when given, else next to the input pdf.
# os.path.dirname() gives '' for a bare filename, which shutil.copy cannot
# use as a destination, so fall back to the current directory.
outdir = args['--output'] or fdir or os.curdir
if args['--format'] == 'csv':
    if len(glob_csv) == 1:
        # a single table is delivered as a plain csv
        result_path = glob_csv[0]
    else:
        zipname = froot + '.zip'
        zippath = os.path.join(tmpdir, zipname)
        print("zipping 'em up")
        with zipfile.ZipFile(zippath, 'a', zipfile.ZIP_DEFLATED) as myzip:
            for g in glob_csv:
                # store every csv under a folder named after the pdf
                myzip.write(g, os.path.join(froot, os.path.basename(g)))
        result_path = zippath
    shutil.copy(result_path, outdir)
    if args['--log']:
        # copy the log itself; zippath is not even defined when only one
        # csv was produced
        shutil.copy(logfname, outdir)
    print("")
elif args['--format'] == 'xlsx':
    from pyexcel_xlsx import save_data
    from collections import OrderedDict
    # one entry per csv, keyed by the csv's base name without extension
    data = OrderedDict()
    for c in glob_csv:
        c_fname = os.path.basename(c)
        c_froot, __ = os.path.splitext(c_fname)
        print("adding %s to excel file" % c_fname)
        with open(c, 'r') as csvfile:
            reader = csv.reader(csvfile)
            data.update({c_froot: [row for row in reader]})
    xlsxname = froot + '.xlsx'
    xlsxpath = os.path.join(tmpdir, xlsxname)
    save_data(xlsxpath, data)
    shutil.copy(xlsxpath, outdir)
    if args['--log']:
        # no zip exists in this branch, so the log file must be named
        # explicitly
        shutil.copy(logfname, outdir)
    print("")
    print("saved as %s" % xlsxname)
|
||||
|
||||
print "cleaning up..."
|
||||
shutil.rmtree(tmpdir)
|
||||
|
||||
print "finished in", time.time() - start_time, "seconds"
|
||||
logging.info("Time taken for " + filename + ": " +
|
||||
logging.info("Time taken for " + fname + ": " +
|
||||
str(time.time() - start_time) + " seconds")
|
||||
|
|
|
|||
58
cell.py
58
cell.py
|
|
@ -1,6 +1,44 @@
|
|||
class Cell:
|
||||
"""Cell
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x1 : int
|
||||
|
||||
y1 : int
|
||||
|
||||
x2 : int
|
||||
|
||||
y2 : int
|
||||
|
||||
Attributes
|
||||
----------
|
||||
lb : tuple
|
||||
|
||||
lt : tuple
|
||||
|
||||
rb : tuple
|
||||
|
||||
rt : tuple
|
||||
|
||||
bbox : tuple
|
||||
|
||||
left : bool
|
||||
|
||||
right : bool
|
||||
|
||||
top : bool
|
||||
|
||||
bottom : bool
|
||||
|
||||
text : string
|
||||
|
||||
spanning_h : bool
|
||||
|
||||
spanning_v : bool
|
||||
"""
|
||||
def __init__(self, x1, y1, x2, y2):
|
||||
|
||||
self.lb = (x1, y1)
|
||||
self.lt = (x1, y2)
|
||||
self.rb = (x2, y1)
|
||||
|
|
@ -15,10 +53,28 @@ class Cell:
|
|||
self.spanning_v = False
|
||||
|
||||
def add_text(self, text):
    """Append text to the text already held by the cell.

    Parameters
    ----------
    text : string
    """
    self.text += text
|
||||
|
||||
def get_text(self):
    """Return the text currently held by the cell.

    Returns
    -------
    text : string
    """
    return self.text
|
||||
|
||||
def get_bounded_edges(self):
    """Count the edges (top, bottom, left, right) that bound the cell.

    Returns
    -------
    bounded_edges : int
    """
    edges = (self.top, self.bottom, self.left, self.right)
    return sum(edges)
|
||||
|
|
|
|||
|
|
@ -1,37 +1,173 @@
|
|||
import os
|
||||
import csv
|
||||
import cv2
|
||||
import glob
|
||||
import numpy as np
|
||||
|
||||
from table import Table
|
||||
from pdf import get_pdf_info
|
||||
from morph_transform import morph_transform
|
||||
from utils import (translate, scale, merge_close_values, get_row_idx,
|
||||
get_column_idx, reduce_index, outline, fill, remove_empty)
|
||||
|
||||
|
||||
def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
|
||||
char_margin, line_margin, word_margin):
|
||||
def morph_transform(img, s=15, invert=False):
    """Morphological Transformation

    Applies a series of morphological operations on the image
    to find table contours and line segments.
    http://answers.opencv.org/question/63847/how-to-extract-tables-from-an-image/

    Empirical result for adaptiveThreshold's blockSize=15 and C=-0.2
    taken from http://pequan.lip6.fr/~bereziat/pima/2012/seuillage/sezgin04.pdf

    Parameters
    ----------
    img : ndarray
        Image that is converted with cv2.COLOR_BGR2GRAY.

    s : int, default: 15, optional
        Scaling factor. Large scaling factor leads to smaller lines
        being detected.

    invert : bool, default: False, optional
        Invert pdf image to make sure that lines are in foreground.

    Returns
    -------
    tables : dict
        Dictionary with table bounding box (x, y + h, x + w, y) as key
        and list of joints found in the table as value. Only the ten
        largest contours are considered, and contours with four or
        fewer joints are dropped.

    v_segments : list
        List of vertical line segments found in the image.

    h_segments : list
        List of horizontal line segments found in the image.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    if not invert:
        gray = np.invert(gray)
    threshold = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -0.2)

    # Floor division keeps the kernel sizes integral on Python 2 and 3.
    verticalsize = threshold.shape[0] // s
    horizontalsize = threshold.shape[1] // s

    ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize))
    hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1))

    # erode followed by dilate with a 1-pixel-wide kernel
    vertical = cv2.erode(threshold, ver, (-1, -1))
    vertical = cv2.dilate(vertical, ver, (-1, -1))

    horizontal = cv2.erode(threshold, hor, (-1, -1))
    horizontal = cv2.dilate(horizontal, hor, (-1, -1))

    mask = vertical + horizontal
    joints = np.bitwise_and(vertical, horizontal)
    # findContours returns (image, contours, hierarchy) in OpenCV 3 and
    # (contours, hierarchy) in other versions; contours is always the
    # second to last item.
    contours = cv2.findContours(
        mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]

    tables = {}
    for c in contours:
        c_poly = cv2.approxPolyDP(c, 3, True)
        x, y, w, h = cv2.boundingRect(c_poly)
        # find number of non-zero values in joints using what boundingRect
        # returns
        roi = joints[y : y + h, x : x + w]
        jc = cv2.findContours(
            roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)[-2]
        if len(jc) <= 4:  # remove contours with <= 4 joints
            continue
        joint_coords = []
        for j in jc:
            jx, jy, jw, jh = cv2.boundingRect(j)
            # centre of the joint, shifted from roi to image coordinates
            c1, c2 = x + (2 * jx + jw) // 2, y + (2 * jy + jh) // 2
            joint_coords.append((c1, c2))
        tables[(x, y + h, x + w, y)] = joint_coords

    v_segments, h_segments = [], []
    vcontours = cv2.findContours(
        vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    for vc in vcontours:
        x, y, w, h = cv2.boundingRect(vc)
        x1, x2 = x, x + w
        y1, y2 = y, y + h
        x_mid = (x1 + x2) // 2
        v_segments.append((x_mid, y2, x_mid, y1))

    hcontours = cv2.findContours(
        horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    for hc in hcontours:
        x, y, w, h = cv2.boundingRect(hc)
        x1, x2 = x, x + w
        y1, y2 = y, y + h
        y_mid = (y1 + y2) // 2
        h_segments.append((x1, y_mid, x2, y_mid))

    return tables, v_segments, h_segments
|
||||
|
||||
|
||||
def lattice(filepath, f=None, s=15, jtol=2, mtol=2, invert=False, debug=None):
|
||||
"""Lattice algorithm
|
||||
|
||||
Makes table using pdf geometry information returned by
|
||||
morph_transform and fills data returned by PDFMiner in table cells.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filepath : string
|
||||
|
||||
f : string, default: None, optional
|
||||
Fill data in horizontal and/or vertical spanning
|
||||
cells. ('h', 'v', 'hv')
|
||||
|
||||
s : int, default: 15, optional
|
||||
Scaling factor. Large scaling factor leads to smaller lines
|
||||
being detected.
|
||||
|
||||
jtol : int, default: 2, optional
|
||||
Tolerance to account for when comparing joint and line
|
||||
coordinates.
|
||||
|
||||
mtol : int, default: 2, optional
|
||||
Tolerance to account for when merging lines which are
|
||||
very close.
|
||||
|
||||
invert : bool, default: False, optional
|
||||
Invert pdf image to make sure that lines are in foreground.
|
||||
|
||||
debug : string
|
||||
Debug by visualizing pdf geometry.
|
||||
('contour', 'line', 'joint', 'table')
|
||||
Returns
|
||||
-------
|
||||
output : dict
|
||||
Dictionary with table number as key and list of data as value.
|
||||
"""
|
||||
if debug:
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib.patches as patches
|
||||
filename = os.path.basename(filepath)
|
||||
print "working on", filename
|
||||
imagename = os.path.join(pdf_dir, filename.split('.')[0] + '.png')
|
||||
fileroot, __ = os.path.splitext(filepath)
|
||||
imagename = fileroot + '.png'
|
||||
img = cv2.imread(imagename)
|
||||
img_x, img_y = img.shape[1], img.shape[0]
|
||||
text, pdf_x, pdf_y = get_pdf_info(
|
||||
os.path.join(pdf_dir, filename), 'spreadsheet',
|
||||
char_margin, line_margin, word_margin)
|
||||
text, pdf_x, pdf_y = get_pdf_info(filepath, method='lattice')
|
||||
scaling_factor_x = pdf_x / float(img_x)
|
||||
scaling_factor_y = pdf_y / float(img_y)
|
||||
tables, v_segments, h_segments = morph_transform(imagename, s, invert)
|
||||
tables, v_segments, h_segments = morph_transform(img, s=s, invert=invert)
|
||||
|
||||
if debug == ["contours"]:
|
||||
if debug == "contour":
|
||||
for t in tables.keys():
|
||||
cv2.rectangle(img, (t[0], t[1]), (t[2], t[3]), (255, 0, 0), 3)
|
||||
plt.imshow(img)
|
||||
if debug == ["joints"]:
|
||||
plt.show()
|
||||
return None
|
||||
if debug == "joint":
|
||||
x_coord = []
|
||||
y_coord = []
|
||||
for k in tables.keys():
|
||||
|
|
@ -42,6 +178,8 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
|
|||
plt.plot(x_coord, y_coord, 'ro')
|
||||
plt.axis([0, max_x + 100, max_y + 100, 0])
|
||||
plt.imshow(img)
|
||||
plt.show()
|
||||
return None
|
||||
|
||||
# detect if vertical
|
||||
num_v = [t for t in text if (not t.upright) and t.get_text().strip()]
|
||||
|
|
@ -80,7 +218,8 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
|
|||
abs(translate(-img_y, h[3])), scaling_factor_y)
|
||||
h_segments_new.append((x1, y1, x2, y2))
|
||||
|
||||
num_tables = 0
|
||||
num_tables = 1
|
||||
output = {}
|
||||
# sort tables based on y-coord
|
||||
for k in sorted(tables_new.keys(), key=lambda x: x[1], reverse=True):
|
||||
# find rows and columns that lie in table
|
||||
|
|
@ -91,19 +230,21 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
|
|||
h_s = [h for h in h_segments_new if h[0] > lb[0] - 2 and h[2]
|
||||
< rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2]
|
||||
|
||||
if debug == ["lines"]:
|
||||
if debug == "line":
|
||||
for v in v_s:
|
||||
plt.plot([v[0], v[2]], [v[1], v[3]])
|
||||
for h in h_s:
|
||||
plt.plot([h[0], h[2]], [h[1], h[3]])
|
||||
plt.show()
|
||||
return None
|
||||
|
||||
columns, rows = zip(*tables_new[k])
|
||||
columns, rows = list(columns), list(rows)
|
||||
columns.extend([lb[0], rt[0]])
|
||||
rows.extend([lb[1], rt[1]])
|
||||
# sort horizontal and vertical segments
|
||||
columns = merge_close_values(sorted(columns), mtol)
|
||||
rows = merge_close_values(sorted(rows, reverse=True), mtol)
|
||||
columns = merge_close_values(sorted(columns), mtol=mtol)
|
||||
rows = merge_close_values(sorted(rows, reverse=True), mtol=mtol)
|
||||
# make grid using x and y coord of shortlisted rows and columns
|
||||
columns = [(columns[i], columns[i + 1])
|
||||
for i in range(0, len(columns) - 1)]
|
||||
|
|
@ -111,13 +252,13 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
|
|||
|
||||
table = Table(columns, rows)
|
||||
# light up cell edges
|
||||
table = table.set_edges(v_s, h_s, jtol)
|
||||
table = table.set_edges(v_s, h_s, jtol=jtol)
|
||||
# table set span method
|
||||
table = table.set_spanning()
|
||||
# TODO
|
||||
# light up table border
|
||||
table = outline(table)
|
||||
|
||||
if debug == ["tables"]:
|
||||
if debug == "table":
|
||||
for i in range(len(table.cells)):
|
||||
for j in range(len(table.cells[i])):
|
||||
if table.cells[i][j].left:
|
||||
|
|
@ -132,8 +273,8 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
|
|||
if table.cells[i][j].bottom:
|
||||
plt.plot([table.cells[i][j].lb[0], table.cells[i][j].rb[0]],
|
||||
[table.cells[i][j].lb[1], table.cells[i][j].rb[1]])
|
||||
if debug:
|
||||
plt.show()
|
||||
return None
|
||||
|
||||
# fill text after sorting it
|
||||
if not rotated:
|
||||
|
|
@ -152,8 +293,8 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
|
|||
r_idx, c_idx = reduce_index(table, rotated, r_idx, c_idx)
|
||||
table.cells[r_idx][c_idx].add_text(t.get_text().strip('\n'))
|
||||
|
||||
if fill:
|
||||
table = fill(table, fill)
|
||||
if f is not None:
|
||||
table = fill(table, f=f)
|
||||
|
||||
data = []
|
||||
for i in range(len(table.cells)):
|
||||
|
|
@ -165,13 +306,7 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
|
|||
data = zip(*data[::1])
|
||||
data.reverse()
|
||||
data = remove_empty(data)
|
||||
csvname = filename.split(
|
||||
'.')[0] + ('_table_%d' % (num_tables + 1)) + '.csv'
|
||||
csvpath = os.path.join(pdf_dir, csvname)
|
||||
with open(csvpath, 'w') as outfile:
|
||||
writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)
|
||||
for d in data:
|
||||
writer.writerow(d)
|
||||
print "saved as", csvname
|
||||
print
|
||||
output['table_%d' % num_tables] = data
|
||||
num_tables += 1
|
||||
|
||||
return output
|
||||
|
|
@ -1,75 +0,0 @@
|
|||
import cv2
|
||||
import numpy as np
|
||||
|
||||
|
||||
def morph_transform(imagename, s, invert):
    """Find table regions, line joints and ruling lines in an image.

    The image is thresholded, then eroded and dilated with a tall
    (1 x N) and a wide (N x 1) rectangular kernel to isolate vertical
    and horizontal lines respectively.

    Parameters
    ----------
    imagename : string
        Path of the image to read with ``cv2.imread``.

    s : int
        Scaling factor. Kernel length is the image height (or width)
        divided by s.

    invert : bool
        If True the grayscale image is thresholded as is, otherwise
        it is inverted before thresholding.

    Returns
    -------
    tables : dict
        Maps a ``(x, y + h, x + w, y)`` bounding tuple of each kept
        contour to the list of joint centre coordinates found in it.

    v_segments : list
        Tuples ``(x_mid, y2, x_mid, y1)``, one per vertical contour.

    h_segments : list
        Tuples ``(x1, y_mid, x2, y_mid)``, one per horizontal contour.
    """
    # http://answers.opencv.org/question/63847/how-to-extract-tables-from-an-image/
    img = cv2.imread(imagename)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # empirical result taken from
    # http://pequan.lip6.fr/~bereziat/pima/2012/seuillage/sezgin04.pdf
    source = gray if invert else np.invert(gray)
    threshold = cv2.adaptiveThreshold(
        source, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,
        15, -0.2)

    # Kernel dimensions must be integers; floor-divide explicitly so the
    # result does not depend on the interpreter's division semantics.
    verticalsize = int(threshold.shape[0] // s)
    horizontalsize = int(threshold.shape[1] // s)

    ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize))
    hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1))

    vertical = cv2.erode(threshold, ver, (-1, -1))
    vertical = cv2.dilate(vertical, ver, (-1, -1))

    horizontal = cv2.erode(threshold, hor, (-1, -1))
    horizontal = cv2.dilate(horizontal, hor, (-1, -1))

    mask = vertical + horizontal
    joints = np.bitwise_and(vertical, horizontal)

    # findContours returns (image, contours, hierarchy) in OpenCV 3 but
    # (contours, hierarchy) in other releases; [-2] is the contour list
    # in both layouts.
    contours = cv2.findContours(
        mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    # only the ten largest contours are considered
    contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]

    tables = {}
    for c in contours:
        c_poly = cv2.approxPolyDP(c, 3, True)
        x, y, w, h = cv2.boundingRect(c_poly)
        # find number of non-zero values in joints using what boundingRect
        # returns
        roi = joints[y:y + h, x:x + w]
        jc = cv2.findContours(
            roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)[-2]
        if len(jc) <= 4:  # remove contours with <= 4 joints
            continue
        joint_coords = []
        for j in jc:
            jx, jy, jw, jh = cv2.boundingRect(j)
            # centre of the joint, shifted back into image coordinates
            c1 = x + (2 * jx + jw) // 2
            c2 = y + (2 * jy + jh) // 2
            joint_coords.append((c1, c2))
        tables[(x, y + h, x + w, y)] = joint_coords

    v_segments, h_segments = [], []
    vcontours = cv2.findContours(
        vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    for vc in vcontours:
        x, y, w, h = cv2.boundingRect(vc)
        x_mid = (x + x + w) // 2
        v_segments.append((x_mid, y + h, x_mid, y))

    hcontours = cv2.findContours(
        horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    for hc in hcontours:
        x, y, w, h = cv2.boundingRect(hc)
        y_mid = (y + y + h) // 2
        h_segments.append((x, y_mid, x + w, y_mid))

    return tables, v_segments, h_segments
|
||||
73
pdf.py
73
pdf.py
|
|
@ -9,35 +9,86 @@ from pdfminer.converter import PDFPageAggregator
|
|||
from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal
|
||||
|
||||
|
||||
def parse_text_stream(layout, t=None):
    """Recursively parse pdf layout to get a list of
    LTTextLineHorizontal objects.

    Parameters
    ----------
    layout : object
        Layout node whose ``_objs`` children are walked. A node
        without ``_objs`` contributes nothing.

    t : list
        Accumulator the matches are appended to. A new list is
        created when omitted.

    Returns
    -------
    t : list
        The accumulator, holding every LTTextLineHorizontal found
        in depth-first order.
    """
    if t is None:
        t = []
    # Only the missing-attribute case is tolerated; looking the attribute
    # up with a default keeps unrelated AttributeErrors visible.
    for obj in getattr(layout, '_objs', ()):
        if isinstance(obj, LTTextLineHorizontal):
            t.append(obj)
        else:
            # share the accumulator instead of building a list per call
            parse_text_stream(obj, t)
    return t
|
||||
|
||||
|
||||
def parse_text_lattice(layout, t=None):
    """Recursively parse pdf layout to get a list of
    LTChar objects.

    Parameters
    ----------
    layout : object
        Layout node whose ``_objs`` children are walked. A node
        without ``_objs`` contributes nothing.

    t : list
        Accumulator the matches are appended to. A new list is
        created when omitted.

    Returns
    -------
    t : list
        The accumulator, holding every LTChar found in depth-first
        order.
    """
    if t is None:
        t = []
    # Only the missing-attribute case is tolerated; looking the attribute
    # up with a default keeps unrelated AttributeErrors visible.
    for obj in getattr(layout, '_objs', ()):
        if isinstance(obj, LTChar):
            t.append(obj)
        else:
            # share the accumulator instead of building a list per call
            parse_text_lattice(obj, t)
    return t
|
||||
|
||||
|
||||
def get_pdf_info(pdfname, method, char_margin, line_margin, word_margin):
|
||||
def get_pdf_info(pdfname, method=None, char_margin=2.0, line_margin=0.5,
|
||||
word_margin=0.1):
|
||||
"""Get list of text objects along with pdf width and height.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
pdfname : string
|
||||
|
||||
method : string
|
||||
|
||||
char_margin : float
|
||||
|
||||
line_margin : float
|
||||
|
||||
word_margin : float
|
||||
|
||||
Returns
|
||||
-------
|
||||
text : list
|
||||
|
||||
pdf_x : int
|
||||
|
||||
pdf_y : int
|
||||
"""
|
||||
if not method:
|
||||
return None
|
||||
with open(pdfname, 'r') as f:
|
||||
parser = PDFParser(f)
|
||||
document = PDFDocument(parser)
|
||||
|
|
@ -52,9 +103,9 @@ def get_pdf_info(pdfname, method, char_margin, line_margin, word_margin):
|
|||
for page in PDFPage.create_pages(document):
|
||||
interpreter.process_page(page)
|
||||
layout = device.get_result()
|
||||
if method == 'basic':
|
||||
text = parse_text_basic(layout)
|
||||
elif method == 'spreadsheet':
|
||||
text = parse_text_spreadsheet(layout)
|
||||
if method == 'stream':
|
||||
text = parse_text_stream(layout)
|
||||
elif method == 'lattice':
|
||||
text = parse_text_lattice(layout)
|
||||
pdf_x, pdf_y = layout.bbox[2], layout.bbox[3]
|
||||
return text, pdf_x, pdf_y
|
||||
|
|
|
|||
|
|
@ -0,0 +1,143 @@
|
|||
import os
|
||||
import numpy as np
|
||||
|
||||
from pdf import get_pdf_info
|
||||
|
||||
|
||||
def overlap(l):
    """Groups overlapping columns and returns list with updated
    columns boundaries.

    Parameters
    ----------
    l : list
        List of (x0, x1) column extents. Sorted internally, so the
        caller does not have to pre-sort it.

    Returns
    -------
    merged : list
        List of merged (x0, x1) column extents in ascending order.
        Extents that merely touch (x0 equal to the previous x1) are
        merged too.
    """
    merged = []
    for higher in sorted(l):
        if merged and higher[0] <= merged[-1][1]:
            lower = merged[-1]
            merged[-1] = (min(lower[0], higher[0]),
                          max(lower[1], higher[1]))
        else:
            merged.append(higher)
    return merged
|
||||
|
||||
|
||||
def stream(filepath, ncolumns=0, columns=None, char_margin=2.0,
           line_margin=0.5, word_margin=0.1, debug=False):
    """Stream algorithm

    Groups data returned by PDFMiner into rows and finds mode of the
    number of elements in each row to guess number of columns.

    Parameters
    ----------
    filepath : string

    ncolumns : int, default: 0, optional
        Number of columns.

    columns : string, default: None, optional
        Comma-separated list of column x-coordinates. An already
        split sequence of coordinates is accepted as well.

    char_margin : float, default: 2.0, optional
        Char margin. Chars closer than char_margin are grouped
        together to form a word.

    line_margin : float, default: 0.5, optional
        Line margin. Lines closer than line_margin are grouped
        together to form a textbox.

    word_margin : float, default: 0.1, optional
        Word margin. Insert blank spaces between chars if distance
        between words is greater than word margin.

    debug : bool, default: False, optional
        Debug by visualizing textboxes.

    Returns
    -------
    output : list
        One list of cell strings per detected row. None is returned
        in debug mode and when no columns could be determined.
    """
    filename = os.path.basename(filepath)
    # Parenthesised single-argument form prints the same text whether
    # print is a statement or a function.
    print("working on %s" % filename)
    text, __, __ = get_pdf_info(
        filepath, method='stream', char_margin=char_margin,
        line_margin=line_margin, word_margin=word_margin)
    text.sort(key=lambda x: (-x.y0, x.x0))

    # Group text lines into rows: a line starts a new row when its y0 is
    # more than 2 units away from the y0 that opened the current row.
    data = []
    temp = []
    y_last = 0
    for t in text:
        # is checking for upright necessary?
        if not t.get_text().strip():
            continue
        if temp and not np.isclose(y_last, t.y0, atol=2):
            data.append(temp)
            temp = []
        if not temp:
            y_last = t.y0
        temp.append(t)
    # Flush the row still being built; without this the last row of the
    # page never reaches the output.
    if temp:
        data.append(temp)
    elements = [len(d) for d in data]

    if debug:
        import matplotlib.pyplot as plt
        import matplotlib.patches as patches

        fig = plt.figure()
        ax = fig.add_subplot(111, aspect='equal')
        xs, ys = [], []
        for d in data:
            for t in d:
                xs.extend([t.x0, t.x1])
                ys.extend([t.y0, t.y1])
                ax.add_patch(
                    patches.Rectangle(
                        (t.x0, t.y0),
                        t.x1 - t.x0,
                        t.y1 - t.y0
                    )
                )
        if xs:
            ax.set_xlim(min(xs) - 10, max(xs) + 10)
            ax.set_ylim(min(ys) - 10, max(ys) + 10)
        plt.show()
        return None

    if columns:
        # The documented form is a comma-separated string; split it so
        # each coordinate (not each character) is converted.
        if hasattr(columns, 'split'):
            columns = columns.split(',')
        bounds = [float(c) for c in columns]
        cols = [(left + right) / 2.0
                for left, right in zip(bounds, bounds[1:])]
    else:
        # a table can't have just 1 column, can it?
        elements = [n for n in elements if n != 1]
        if ncolumns:
            mode = ncolumns
        elif elements:
            mode = max(set(elements), key=elements.count)
        else:
            print("couldn't find a table on this page")
            return None
        cols = [(t.x0, t.x1) for d in data for t in d if len(d) == mode]
        cols = overlap(sorted(cols))
        cols = [(c[0] + c[1]) / 2.0 for c in cols]

    if data and not cols:
        print("couldn't find a table on this page")
        return None

    output = [['' for c in cols] for d in data]
    for row, d in enumerate(data):
        for t in d:
            # assign the text to the column whose centre is closest to
            # the centre of the text box
            cog = (t.x0 + t.x1) / 2.0
            idx = min(range(len(cols)), key=lambda i: abs(cog - cols[i]))
            if output[row][idx]:
                output[row][idx] += ' ' + t.get_text().strip()
            else:
                output[row][idx] = t.get_text().strip()

    return output
|
||||
73
table.py
73
table.py
|
|
@ -4,14 +4,55 @@ from cell import Cell
|
|||
|
||||
|
||||
class Table:
|
||||
"""Table
|
||||
|
||||
Parameters
|
||||
----------
|
||||
columns : list
|
||||
List of column x-coordinates.
|
||||
|
||||
rows : list
|
||||
List of row y-coordinates.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
cells : list
|
||||
2-D list of cell objects.
|
||||
|
||||
columns : list
|
||||
List of column x-coordinates.
|
||||
|
||||
rows : list
|
||||
List of row y-coordinates.
|
||||
"""
|
||||
def __init__(self, columns, rows):
|
||||
|
||||
self.cells = [[Cell(c[0], r[1], c[1], r[0])
|
||||
for c in columns] for r in rows]
|
||||
self.columns = columns
|
||||
self.rows = rows
|
||||
|
||||
def set_edges(self, vertical, horizontal, jtol):
|
||||
def set_edges(self, vertical, horizontal, jtol=2):
|
||||
"""Set cell edges to True if corresponding line segments
|
||||
are detected in the pdf image.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
vertical : list
|
||||
List of vertical line segments.
|
||||
|
||||
horizontal : list
|
||||
List of horizontal line segments.
|
||||
|
||||
jtol : int, default: 2, optional
|
||||
Tolerance to account for when comparing joint and line
|
||||
coordinates.
|
||||
|
||||
Returns
|
||||
-------
|
||||
self : object
|
||||
Returns self.
|
||||
"""
|
||||
for v in vertical:
|
||||
# find closest x coord
|
||||
# iterate over y coords and find closest points
|
||||
|
|
@ -117,6 +158,14 @@ class Table:
|
|||
return self
|
||||
|
||||
def set_spanning(self):
|
||||
"""Set spanning values of a cell to True if it isn't
|
||||
bounded by four edges.
|
||||
|
||||
Returns
|
||||
-------
|
||||
self : object
|
||||
Returns self.
|
||||
"""
|
||||
for i in range(len(self.cells)):
|
||||
for j in range(len(self.cells[i])):
|
||||
bound = self.cells[i][j].get_bounded_edges()
|
||||
|
|
@ -125,28 +174,38 @@ class Table:
|
|||
|
||||
elif bound == 3:
|
||||
if not self.cells[i][j].left:
|
||||
if self.cells[i][j].right and self.cells[i][j].top and self.cells[i][j].bottom:
|
||||
if (self.cells[i][j].right and
|
||||
self.cells[i][j].top and
|
||||
self.cells[i][j].bottom):
|
||||
self.cells[i][j].spanning_h = True
|
||||
|
||||
elif not self.cells[i][j].right:
|
||||
if self.cells[i][j].left and self.cells[i][j].top and self.cells[i][j].bottom:
|
||||
if (self.cells[i][j].left and
|
||||
self.cells[i][j].top and
|
||||
self.cells[i][j].bottom):
|
||||
self.cells[i][j].spanning_h = True
|
||||
|
||||
elif not self.cells[i][j].top:
|
||||
if self.cells[i][j].left and self.cells[i][j].right and self.cells[i][j].bottom:
|
||||
if (self.cells[i][j].left and
|
||||
self.cells[i][j].right and
|
||||
self.cells[i][j].bottom):
|
||||
self.cells[i][j].spanning_v = True
|
||||
|
||||
elif not self.cells[i][j].bottom:
|
||||
if self.cells[i][j].left and self.cells[i][j].right and self.cells[i][j].top:
|
||||
if (self.cells[i][j].left and
|
||||
self.cells[i][j].right and
|
||||
self.cells[i][j].top):
|
||||
self.cells[i][j].spanning_v = True
|
||||
|
||||
elif bound == 2:
|
||||
if self.cells[i][j].left and self.cells[i][j].right:
|
||||
if not self.cells[i][j].top and not self.cells[i][j].bottom:
|
||||
if (not self.cells[i][j].top and
|
||||
not self.cells[i][j].bottom):
|
||||
self.cells[i][j].spanning_v = True
|
||||
|
||||
elif self.cells[i][j].top and self.cells[i][j].bottom:
|
||||
if not self.cells[i][j].left and not self.cells[i][j].right:
|
||||
if (not self.cells[i][j].left and
|
||||
not self.cells[i][j].right):
|
||||
self.cells[i][j].spanning_h = True
|
||||
|
||||
return self
|
||||
|
|
|
|||
154
utils.py
154
utils.py
|
|
@ -2,16 +2,61 @@ import numpy as np
|
|||
|
||||
|
||||
def translate(x1, x2):
    """Translate coordinate x2 by x1.

    Parameters
    ----------
    x1 : float
        Offset to add.

    x2 : float
        Coordinate to translate. It is never modified in place, so
        passing a mutable value such as an array is safe.

    Returns
    -------
    x2 : float
        The translated coordinate, x2 + x1.
    """
    return x2 + x1
|
||||
|
||||
|
||||
def scale(x, s):
    """Scale coordinate x by scaling factor s.

    Parameters
    ----------
    x : float
        Coordinate to scale. It is never modified in place, so
        passing a mutable value such as an array is safe.

    s : float
        Scaling factor.

    Returns
    -------
    x : float
        The scaled coordinate, x * s.
    """
    return x * s
|
||||
|
||||
|
||||
def rotate(x1, y1, x2, y2, angle):
|
||||
"""Rotate point x2, y2 about point x1, y1 by angle.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x1 : float
|
||||
|
||||
y1 : float
|
||||
|
||||
x2 : float
|
||||
|
||||
y2 : float
|
||||
|
||||
angle : float
|
||||
Angle in radians.
|
||||
|
||||
Returns
|
||||
-------
|
||||
xnew : float
|
||||
|
||||
ynew : float
|
||||
"""
|
||||
s = np.sin(angle)
|
||||
c = np.cos(angle)
|
||||
x2 = translate(-x1, x2)
|
||||
|
|
@ -23,7 +68,20 @@ def rotate(x1, y1, x2, y2, angle):
|
|||
return xnew, ynew
|
||||
|
||||
|
||||
def remove_close_values(ar, mtol):
|
||||
def remove_close_values(ar, mtol=2):
|
||||
"""Remove values which are within a tolerance of mtol of another value
|
||||
present in list.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
ar : list
|
||||
|
||||
mtol : int, default: 2, optional
|
||||
|
||||
Returns
|
||||
-------
|
||||
ret : list
|
||||
"""
|
||||
ret = []
|
||||
for a in ar:
|
||||
if not ret:
|
||||
|
|
@ -37,7 +95,20 @@ def remove_close_values(ar, mtol):
|
|||
return ret
|
||||
|
||||
|
||||
def merge_close_values(ar, mtol):
|
||||
def merge_close_values(ar, mtol=2):
|
||||
"""Merge values which are within a tolerance of mtol by calculating
|
||||
a moving mean.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
ar : list
|
||||
|
||||
mtol : int, default: 2, optional
|
||||
|
||||
Returns
|
||||
-------
|
||||
ret : list
|
||||
"""
|
||||
ret = []
|
||||
for a in ar:
|
||||
if not ret:
|
||||
|
|
@ -53,18 +124,63 @@ def merge_close_values(ar, mtol):
|
|||
|
||||
|
||||
def get_row_idx(t, rows):
    """Get index of the row in which the given object falls by
    comparing their co-ordinates.

    Parameters
    ----------
    t : object
        Object with y0 and y1 attributes.

    rows : list
        Row extents; a row matches when the vertical midpoint of t
        is strictly below its first value and strictly above its
        second.

    Returns
    -------
    r : int
        Index of the first matching row, or None if no row matches.
    """
    # the midpoint does not depend on the row, so compute it once
    y_mid = (t.y0 + t.y1) / 2.0
    for r, row in enumerate(rows):
        if row[1] < y_mid < row[0]:
            return r
    return None
|
||||
|
||||
|
||||
def get_column_idx(t, columns):
    """Get index of the column in which the given object falls by
    comparing their co-ordinates.

    Parameters
    ----------
    t : object
        Object with x0 and x1 attributes.

    columns : list
        Column extents; a column matches when the horizontal
        midpoint of t is strictly above its first value and strictly
        below its second.

    Returns
    -------
    c : int
        Index of the first matching column, or None if no column
        matches.
    """
    # the midpoint does not depend on the column, so compute it once
    x_mid = (t.x0 + t.x1) / 2.0
    for c, column in enumerate(columns):
        if column[0] < x_mid < column[1]:
            return c
    return None
|
||||
|
||||
|
||||
def reduce_index(t, rotated, r_idx, c_idx):
|
||||
"""Shift a text object if it lies within a spanning cell taking
|
||||
in account table rotation.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
t : object
|
||||
|
||||
rotated : string
|
||||
|
||||
r_idx : int
|
||||
|
||||
c_idx : int
|
||||
|
||||
Returns
|
||||
-------
|
||||
r_idx : int
|
||||
|
||||
c_idx : int
|
||||
"""
|
||||
if not rotated:
|
||||
if t.cells[r_idx][c_idx].spanning_h:
|
||||
while not t.cells[r_idx][c_idx].left:
|
||||
|
|
@ -90,6 +206,16 @@ def reduce_index(t, rotated, r_idx, c_idx):
|
|||
|
||||
|
||||
def outline(t):
|
||||
"""Light up table boundary.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
t : object
|
||||
|
||||
Returns
|
||||
-------
|
||||
t : object
|
||||
"""
|
||||
for i in range(len(t.cells)):
|
||||
t.cells[i][0].left = True
|
||||
t.cells[i][len(t.cells[i]) - 1].right = True
|
||||
|
|
@ -99,7 +225,19 @@ def outline(t):
|
|||
return t
|
||||
|
||||
|
||||
def fill(t, f):
|
||||
def fill(t, f=None):
|
||||
"""Fill spanning cells.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
t : object
|
||||
|
||||
f : string, default: None, optional
|
||||
|
||||
Returns
|
||||
-------
|
||||
t : object
|
||||
"""
|
||||
if f == "h":
|
||||
for i in range(len(t.cells)):
|
||||
for j in range(len(t.cells[i])):
|
||||
|
|
@ -124,6 +262,16 @@ def fill(t, f):
|
|||
|
||||
|
||||
def remove_empty(d):
|
||||
"""Remove empty rows and columns.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
d : list
|
||||
|
||||
Returns
|
||||
-------
|
||||
d : list
|
||||
"""
|
||||
for i, row in enumerate(d):
|
||||
if row == [''] * len(row):
|
||||
d.pop(i)
|
||||
|
|
|
|||
Loading…
Reference in New Issue